Till Fischer
committed on
Commit
·
cac53d2
1
Parent(s):
8aac46d
Fix PunktTokenizer für Hugging Face Space
Browse files - analyze_aspects.py +9 -4
analyze_aspects.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
|
| 4 |
#python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
|
| 5 |
# python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
|
| 6 |
-
|
| 7 |
import sqlite3
|
| 8 |
import argparse
|
| 9 |
import logging
|
|
@@ -13,7 +13,6 @@ from transformers import pipeline
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import matplotlib.pyplot as plt
|
| 15 |
|
| 16 |
-
|
| 17 |
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
|
| 18 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 19 |
|
|
@@ -42,7 +41,7 @@ def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path,
|
|
| 42 |
|
| 43 |
|
| 44 |
# NLTK punkt model for sentence tokenization
|
| 45 |
-
nltk.download('punkt')
|
| 46 |
from nltk import sent_tokenize
|
| 47 |
|
| 48 |
# Logging Configuration
|
|
@@ -119,7 +118,13 @@ def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list
|
|
| 119 |
continue
|
| 120 |
|
| 121 |
logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
if lang == 'de':
|
| 125 |
aspect_map = ASPECT_LABEL_MAP
|
|
|
|
| 3 |
|
| 4 |
#python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
|
| 5 |
# python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
|
| 6 |
+
# Fixing Punkt tokenizer bug
|
| 7 |
import sqlite3
|
| 8 |
import argparse
|
| 9 |
import logging
|
|
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import matplotlib.pyplot as plt
|
| 15 |
|
|
|
|
| 16 |
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
|
| 17 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 18 |
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
# NLTK punkt model for sentence tokenization
|
| 44 |
+
nltk.download('punkt', download_dir='/home/user/nltk_data')
|
| 45 |
from nltk import sent_tokenize
|
| 46 |
|
| 47 |
# Logging Configuration
|
|
|
|
| 118 |
continue
|
| 119 |
|
| 120 |
logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
|
| 121 |
+
import os
|
| 122 |
+
nltk.download('punkt')
|
| 123 |
+
nltk.data.path.append("/home/user/nltk_data")
|
| 124 |
+
os.environ["NLTK_DATA"] = "/home/user/nltk_data"
|
| 125 |
+
|
| 126 |
+
lang_map = {'de': 'german', 'en': 'english'}
|
| 127 |
+
sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))
|
| 128 |
|
| 129 |
if lang == 'de':
|
| 130 |
aspect_map = ASPECT_LABEL_MAP
|