Spaces:

Zevir
/

SERASA_BERT_OCR

Sleeping

App Files Community

Zevir committed 22 days ago

Commit

da8ca36

1 Parent(s): b3f6538

new dataset

Browse files

Files changed (10) hide show

app/{app.py → main.py} +0 -0
app/model/config.json +2 -2
app/model/model.safetensors +1 -1
app/model/special_tokens_map.json +2 -2
app/model/tokenizer_config.json +2 -2
train/__init__.py +0 -0
train/evaluate_model.py +98 -0
{app → train}/train_finetune.py +66 -30
train/utils/__pycache__/preprocess.cpython-312.pyc +0 -0
train/utils/preprocess.py +11 -0

app/{app.py → main.py} RENAMED Viewed

File without changes

app/model/config.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc5dd73b5b7f525c447c56cf92780ba98e78c95c7dfaff7e92cb0d5531d0b82f
-size 866

 version https://git-lfs.github.com/spec/v1
+oid sha256:39d172162dc11165b0407c9bf149e7a59099ff4e9c1b28183bdd07b27ff95944
+size 898

app/model/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27642721c62ef81f6961c3051c3246035166f90d888b853f4e91e2ca8ae3c460
 size 435722224

 version https://git-lfs.github.com/spec/v1
+oid sha256:83503a45ba59d26dadc5ce8961d0e4ee4948be9bb82d0cf23c47a33ee3264e31
 size 435722224

app/model/special_tokens_map.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
-size 125

 version https://git-lfs.github.com/spec/v1
+oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6
+size 132

app/model/tokenizer_config.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ab044e4a71cdb2a5cff548e16d3bcd46a757848ef861743426c44a134b00da1
-size 1301

 version https://git-lfs.github.com/spec/v1
+oid sha256:7525d84e2d59af28db08243599eb1da698db5f415be9fc1f5a41332d27be405b
+size 1359

train/__init__.py ADDED Viewed

File without changes

train/evaluate_model.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import os
+import torch
+from torch.utils.data import DataLoader, Dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from utils.preprocess import preprocess_text
+FAKE_DIR = "data/fake_news/financeiros"
+REAL_DIR = "data/real_news/financeiros"
+MODEL_DIR = "app/model"
+MAX_LEN = 256
+BATCH_SIZE = 8
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# ========= LOAD DATA =========
+def load_texts(directory, label):
+    samples = []
+    for root, _, files in os.walk(directory):
+        for fname in files:
+            if fname.endswith(".txt"):
+                path = os.path.join(root, fname)
+                with open(path, "r", encoding="utf-8") as f:
+                    text = preprocess_text(f.read())
+                    samples.append((text, label))
+    return samples
+def load_dataset():
+    fake = load_texts(FAKE_DIR, 0)
+    real = load_texts(REAL_DIR, 1)
+    data = fake + real
+    texts = [t for t, _ in data]
+    labels = [l for _, l in data]
+    return texts, labels
+# ========= DATASET =========
+class NewsDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer):
+        self.texts = texts
+        self.labels = labels
+        self.tok = tokenizer
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, idx):
+        enc = self.tok(
+            self.texts[idx],
+            truncation=True,
+            padding="max_length",
+            max_length=MAX_LEN,
+            return_tensors="pt"
+        )
+        enc = {k: v.squeeze() for k, v in enc.items()}
+        enc["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
+        return enc
+# ========= EVALUATION =========
+def evaluate():
+    print("Carregando modelo...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
+    print("Carregando dataset...")
+    texts, labels = load_dataset()
+    dataset = NewsDataset(texts, labels, tokenizer)
+    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
+    model.eval()
+    preds = []
+    true_labels = []
+    print("\nAvaliando...\n")
+    with torch.no_grad():
+        for batch in loader:
+            batch = {k: v.to(device) for k, v in batch.items()}
+            outputs = model(**batch)
+            p = torch.argmax(outputs.logits, dim=1).cpu().numpy()
+            l = batch["labels"].cpu().numpy()
+            preds.extend(p)
+            true_labels.extend(l)
+    # === METRICS ===
+    acc = accuracy_score(true_labels, preds)
+    print(f"Accuracy: {acc:.4f}")
+    print("\nClassification Report:")
+    print(classification_report(true_labels, preds, target_names=["Fake", "Real"]))
+    print("\nConfusion Matrix:")
+    print(confusion_matrix(true_labels, preds))
+if __name__ == "__main__":
+    evaluate()

{app → train}/train_finetune.py RENAMED Viewed

@@ -1,20 +1,23 @@
 import os
 import torch
 from torch.utils.data import DataLoader, Dataset
-from torch.optim import AdamW
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
     get_linear_schedule_with_warmup
 )
 from tqdm import tqdm
 import random
-from app.preprocess import preprocess_text
-# CONFIGURAÇÕES
 MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
 OUTPUT_DIR = "app/model"
 FAKE_DIR = "data/fake_news/financeiros"
 REAL_DIR = "data/real_news/financeiros"
@@ -24,11 +27,9 @@ LR = 2e-5
 MAX_LEN = 256
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🔥 Treinando em: {device}")
-# FUNÇÕES AUXILIARES
 def load_texts_from_dir(directory, label):
-    """Lê recursivamente todos os .txt em todas as subpastas."""
     samples = []
     for root, _, files in os.walk(directory):
@@ -37,32 +38,26 @@ def load_texts_from_dir(directory, label):
                 path = os.path.join(root, fname)
                 try:
                     with open(path, "r", encoding="utf-8") as f:
-                        text = f.read()
-                        text = preprocess_text(text)
                         samples.append((text, label))
                 except Exception as e:
-                    print(f"⚠ Erro ao ler {path}: {e}")
     return samples
 def load_dataset():
-    """Carrega fake e real em formato único."""
-    print("📂 Carregando dados das pastas...")
     fake = load_texts_from_dir(FAKE_DIR, 0)
     real = load_texts_from_dir(REAL_DIR, 1)
-    dataset = fake + real
-    random.shuffle(dataset)
-    print(f"✔ Total Fake: {len(fake)}")
-    print(f"✔ Total Real: {len(real)}")
-    print(f"✔ Total: {len(dataset)}")
-    texts, labels = zip(*dataset)
     return list(texts), list(labels)
-# DATASET DO TORCH
 class NewsDataset(Dataset):
     def __init__(self, texts, labels, tokenizer):
         self.texts = texts
@@ -84,47 +79,88 @@ class NewsDataset(Dataset):
         encoded["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
         return encoded
-# PROCESSO DE TREINAMENTO
 def train():
     texts, labels = load_dataset()
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
-    dataset = NewsDataset(texts, labels, tokenizer)
-    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
     optimizer = AdamW(model.parameters(), lr=LR)
-    total_steps = len(loader) * EPOCHS
     scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)
-    print("\n🚀 Iniciando fine-tuning do BERT...\n")
     model.train()
     for epoch in range(EPOCHS):
-        print(f"\n===== Época {epoch+1}/{EPOCHS} =====")
         epoch_loss = 0
-        for batch in tqdm(loader):
             batch = {k: v.to(device) for k, v in batch.items()}
             outputs = model(**batch)
             loss = outputs.loss
-            epoch_loss += loss.item()
             loss.backward()
             optimizer.step()
             scheduler.step()
             optimizer.zero_grad()
-        print(f"📉 Loss da época: {epoch_loss / len(loader):.4f}")
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     model.save_pretrained(OUTPUT_DIR)
     tokenizer.save_pretrained(OUTPUT_DIR)
-    print(f"\n🎉 Modelo salvo em: {OUTPUT_DIR}\n")
 if __name__ == "__main__":

 import os
 import torch
 from torch.utils.data import DataLoader, Dataset
+from torch.optim import AdamW
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
     get_linear_schedule_with_warmup
 )
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, accuracy_score
 from tqdm import tqdm
+import numpy as np
 import random
+from utils.preprocess import preprocess_text
 MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
 OUTPUT_DIR = "app/model"
 FAKE_DIR = "data/fake_news/financeiros"
 REAL_DIR = "data/real_news/financeiros"
 MAX_LEN = 256
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Treinando em: {device}")
 def load_texts_from_dir(directory, label):
     samples = []
     for root, _, files in os.walk(directory):
                 path = os.path.join(root, fname)
                 try:
                     with open(path, "r", encoding="utf-8") as f:
+                        text = preprocess_text(f.read())
                         samples.append((text, label))
                 except Exception as e:
+                    print(f"Erro ao ler {path}: {e}")
     return samples
 def load_dataset():
     fake = load_texts_from_dir(FAKE_DIR, 0)
     real = load_texts_from_dir(REAL_DIR, 1)
+    all_data = fake + real
+    random.shuffle(all_data)
+    print(f"Fake: {len(fake)} | Real: {len(real)} | Total: {len(all_data)}")
+    texts, labels = zip(*all_data)
     return list(texts), list(labels)
 class NewsDataset(Dataset):
     def __init__(self, texts, labels, tokenizer):
         self.texts = texts
         encoded["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
         return encoded
 def train():
     texts, labels = load_dataset()
+    # SEPARAÇÃO REAL entre treino, validação e teste
+    X_train, X_test, y_train, y_test = train_test_split(
+        texts, labels, test_size=0.20, stratify=labels, random_state=42
+    )
+    X_train, X_val, y_train, y_val = train_test_split(
+        X_train, y_train, test_size=0.10, stratify=y_train, random_state=42
+    )
+    print(f"\nTreino: {len(X_train)} | Val: {len(X_val)} | Teste: {len(X_test)}\n")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        MODEL_NAME, num_labels=2
+    ).to(device)
+    # LOADERS
+    train_dataset = NewsDataset(X_train, y_train, tokenizer)
+    val_dataset = NewsDataset(X_val, y_val, tokenizer)
+    test_dataset = NewsDataset(X_test, y_test, tokenizer)
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
     optimizer = AdamW(model.parameters(), lr=LR)
+    total_steps = len(train_loader) * EPOCHS
     scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)
+    print("Iniciando fine-tuning...\n")
     model.train()
     for epoch in range(EPOCHS):
+        print(f"=== Época {epoch+1}/{EPOCHS} ===")
         epoch_loss = 0
+        for batch in tqdm(train_loader):
             batch = {k: v.to(device) for k, v in batch.items()}
             outputs = model(**batch)
             loss = outputs.loss
+            epoch_loss += loss.item()
             loss.backward()
             optimizer.step()
             scheduler.step()
             optimizer.zero_grad()
+        print(f"Loss da época: {epoch_loss / len(train_loader):.4f}")
+    print("\nAvaliando...")
+    model.eval()
+    all_preds = []
+    all_true = []
+    with torch.no_grad():
+        for batch in tqdm(test_loader):
+            labels = batch["labels"].numpy()
+            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
+            outputs = model(**inputs)
+            preds = outputs.logits.argmax(dim=1).cpu().numpy()
+            all_preds.extend(preds)
+            all_true.extend(labels)
+    # MÉTRICAS
+    print("\n=== Classification Report ===")
+    print(classification_report(all_true, all_preds, target_names=["Fake", "Real"]))
+    print("Accuracy:", accuracy_score(all_true, all_preds))
+    # SALVAR MODELO
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     model.save_pretrained(OUTPUT_DIR)
     tokenizer.save_pretrained(OUTPUT_DIR)
+    print(f"\nModelo salvo em: {OUTPUT_DIR}\n")
 if __name__ == "__main__":

train/utils/__pycache__/preprocess.cpython-312.pyc ADDED Viewed

Binary file (782 Bytes). View file

train/utils/preprocess.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import re
+import unicodedata
+def preprocess_text(text):
+    text = unicodedata.normalize("NFKC", text)
+    text = re.sub(r"http\S+|www\.\S+", "", text)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text