#!/usr/bin/env python3
# analyze_aspects.py
#
# Usage:
#   python analyze_aspects.py --isbn "9783446264199" --db-path /path/to/database.sqlite --languages de

import sqlite3
import argparse
import logging
from pathlib import Path
from collections import defaultdict
from collections.abc import Sequence

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import matplotlib.pyplot as plt

# NLTK sentence-tokenizer models. Newer NLTK releases (>= 3.9) load
# 'punkt_tab' instead of 'punkt', so fetch both to cover either version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# --- Logging configuration ---
def configure_logging():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    return logging.getLogger(__name__)

logger = configure_logging()

# --- Aspect label maps ---
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "Kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", "Humor"],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"],
}

ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"],
}

ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]

# --- Visualization ---
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path,
                      filename: str = "sentiment_aspekte.png") -> None:
    """Render a horizontal bar chart of the average sentiment score per aspect."""
    output_dir.mkdir(parents=True, exist_ok=True)

    aspects = list(aspect_results.keys())
    avg_scores = [sum(scores) / len(scores) for scores in aspect_results.values()]
    # Green for clearly positive, red for clearly negative, gray for neutral.
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray'
              for score in avg_scores]

    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")
    for bar, score in zip(bars, avg_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center')
    plt.tight_layout()
    plt.gca().invert_yaxis()

    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()
    logger.info(f"Diagramm gespeichert unter: {output_path}")

# --- Database access ---
def load_reviews(db_path: Path, isbn: str) -> list[tuple[int, str, str]]:
    """Load cleaned German/English review texts for a book as (id, text, lang) tuples."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
        (isbn,)
    )
    rows = cursor.fetchall()
    conn.close()

    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        if text_de and isinstance(text_de, str):
            texts_to_analyze.append((review_id, text_de, 'de'))
        if text_en and isinstance(text_en, str):
            texts_to_analyze.append((review_id, text_en, 'en'))
    return texts_to_analyze

# --- Analysis ---
def analyze_quickwin(db_path: Path, isbn: str, device: int = -1,
                     languages: Sequence[str] = ("de", "en")) -> dict[str, list[float]]:
    """Sentence-level aspect detection (zero-shot) plus sentiment scoring per aspect."""
    reviews = load_reviews(db_path, isbn)
    reviews = [r for r in reviews if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",
                   device=device, multi_label=True)
    sent_de = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert",
                       device=device)
    sent_en = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english",
                       device=device)

    aspect_results = defaultdict(list)
    total_aspects = 0
    lang_map = {'de': 'german', 'en': 'english'}

    for review_id, text, lang in reviews:
        if not text:
            continue
        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
        sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))

        if lang == 'de':
            aspect_map = ASPECT_LABEL_MAP
            all_labels = ALL_LABELS
            sent_pipeline = sent_de
            hypothesis_template = "Dieser Satz handelt von {}."
        elif lang == 'en':
            aspect_map = ASPECT_LABEL_MAP_EN
            all_labels = [label for labels in aspect_map.values() for label in labels]
            sent_pipeline = sent_en
            hypothesis_template = "This sentence is about {}."
        else:
            continue

        for sent in sentences:
            # Skip empty fragments and very short sentences with little signal.
            if not sent.strip() or len(sent) < 15:
                continue

            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)

            # Labels come back sorted by score; take the top label if it clears
            # the confidence threshold and map it back to its aspect category.
            main_label = ""
            best_score = 0.0
            for label, score in zip(result["labels"], result["scores"]):
                if score > 0.8:
                    main_label = next((k for k, v in aspect_map.items() if label in v), label)
                    best_score = score
                    break
            if not main_label:
                continue

            ml_sentiment = sent_pipeline(sent)[0]
            # Map the model label onto a signed score. german-sentiment-bert can
            # also return 'neutral', which must not be folded into negative.
            ml_label = ml_sentiment['label'].upper()
            if ml_label.startswith('POS'):
                final_score = ml_sentiment['score']
            elif ml_label.startswith('NEG'):
                final_score = -ml_sentiment['score']
            else:
                final_score = 0.0
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'

            logger.info(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f" Aspekt: {main_label} (via '{result['labels'][0]}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )

            aspect_results[main_label].append(final_score)
            total_aspects += 1

    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results

# --- Entry point ---
def main():
    parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    args = parser.parse_args()

    device = 0 if args.gpu else -1
    aspect_results = analyze_quickwin(
        Path(args.db_path), args.isbn, device=device, languages=args.languages
    )
    if aspect_results:
        output_dir = Path("output")
        visualize_aspects(aspect_results, output_dir)
    else:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
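
# Entry-point guard so main() runs when the script is executed directly
# (e.g. via the usage line at the top) but not when it is imported.
if __name__ == "__main__":
    main()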