Zevir committed
Commit da8ca36 · Parent: b3f6538

new dataset

app/{app.py → main.py} RENAMED
File without changes
app/model/config.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc5dd73b5b7f525c447c56cf92780ba98e78c95c7dfaff7e92cb0d5531d0b82f
-size 866
+oid sha256:39d172162dc11165b0407c9bf149e7a59099ff4e9c1b28183bdd07b27ff95944
+size 898
app/model/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27642721c62ef81f6961c3051c3246035166f90d888b853f4e91e2ca8ae3c460
+oid sha256:83503a45ba59d26dadc5ce8961d0e4ee4948be9bb82d0cf23c47a33ee3264e31
 size 435722224
app/model/special_tokens_map.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
-size 125
+oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6
+size 132
app/model/tokenizer_config.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ab044e4a71cdb2a5cff548e16d3bcd46a757848ef861743426c44a134b00da1
-size 1301
+oid sha256:7525d84e2d59af28db08243599eb1da698db5f415be9fc1f5a41332d27be405b
+size 1359
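
Note: the four app/model files above are Git LFS pointer files, so the diff swaps the referenced object hash and size rather than the binary contents. As a quick sanity check that a checkout references the new weights, here is a minimal sketch (assuming the file on disk still holds the raw pointer text, not the smudged binary):

def read_lfs_pointer(path):
    # Parse the "key value" lines of a Git LFS pointer file.
    info = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key and value:
                info[key] = value
    return info

ptr = read_lfs_pointer("app/model/model.safetensors")
print(ptr["oid"], ptr["size"])  # expected after this commit: sha256:83503a45... 435722224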
train/__init__.py ADDED
File without changes
train/evaluate_model.py ADDED
@@ -0,0 +1,98 @@
+import os
+import torch
+from torch.utils.data import DataLoader, Dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+
+from utils.preprocess import preprocess_text
+
+FAKE_DIR = "data/fake_news/financeiros"
+REAL_DIR = "data/real_news/financeiros"
+MODEL_DIR = "app/model"
+MAX_LEN = 256
+BATCH_SIZE = 8
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# ========= LOAD DATA =========
+def load_texts(directory, label):
+    samples = []
+    for root, _, files in os.walk(directory):
+        for fname in files:
+            if fname.endswith(".txt"):
+                path = os.path.join(root, fname)
+                with open(path, "r", encoding="utf-8") as f:
+                    text = preprocess_text(f.read())
+                samples.append((text, label))
+    return samples
+
+def load_dataset():
+    fake = load_texts(FAKE_DIR, 0)
+    real = load_texts(REAL_DIR, 1)
+    data = fake + real
+    texts = [t for t, _ in data]
+    labels = [l for _, l in data]
+    return texts, labels
+
+# ========= DATASET =========
+class NewsDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer):
+        self.texts = texts
+        self.labels = labels
+        self.tok = tokenizer
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        enc = self.tok(
+            self.texts[idx],
+            truncation=True,
+            padding="max_length",
+            max_length=MAX_LEN,
+            return_tensors="pt"
+        )
+        enc = {k: v.squeeze() for k, v in enc.items()}
+        enc["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
+        return enc
+
+# ========= EVALUATION =========
+def evaluate():
+    print("Loading model...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
+
+    print("Loading dataset...")
+    texts, labels = load_dataset()
+    dataset = NewsDataset(texts, labels, tokenizer)
+    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
+
+    model.eval()
+    preds = []
+    true_labels = []
+
+    print("\nEvaluating...\n")
+
+    with torch.no_grad():
+        for batch in loader:
+            batch = {k: v.to(device) for k, v in batch.items()}
+            outputs = model(**batch)
+            p = torch.argmax(outputs.logits, dim=1).cpu().numpy()
+            l = batch["labels"].cpu().numpy()
+
+            preds.extend(p)
+            true_labels.extend(l)
+
+    # === METRICS ===
+    acc = accuracy_score(true_labels, preds)
+    print(f"Accuracy: {acc:.4f}")
+
+    print("\nClassification Report:")
+    print(classification_report(true_labels, preds, target_names=["Fake", "Real"]))
+
+    print("\nConfusion Matrix:")
+    print(confusion_matrix(true_labels, preds))
+
+
+if __name__ == "__main__":
+    evaluate()
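
With the updated model in app/model, a minimal single-text inference sketch follows the same label convention as the evaluation script above (0 = Fake, 1 = Real); the input string is hypothetical, and in practice the same preprocess_text cleanup should be applied first:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_DIR = "app/model"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

text = "Banco anuncia lucro recorde no trimestre."  # hypothetical input
enc = tokenizer(text, truncation=True, max_length=256, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits
probs = torch.softmax(logits, dim=1).squeeze()
label = ["Fake", "Real"][int(probs.argmax().item())]
print(f"{label} ({probs.max().item():.2%})")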
{app → train}/train_finetune.py RENAMED
@@ -1,20 +1,23 @@
 import os
 import torch
 from torch.utils.data import DataLoader, Dataset
-from torch.optim import AdamW
+from torch.optim import AdamW
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
     get_linear_schedule_with_warmup
 )
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, accuracy_score
 from tqdm import tqdm
+import numpy as np
 import random
 
-from app.preprocess import preprocess_text
+from utils.preprocess import preprocess_text
 
-# CONFIGURATION
 MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
 OUTPUT_DIR = "app/model"
+
 FAKE_DIR = "data/fake_news/financeiros"
 REAL_DIR = "data/real_news/financeiros"
 
@@ -24,11 +27,9 @@ LR = 2e-5
 MAX_LEN = 256
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🔥 Training on: {device}")
+print(f"Training on: {device}")
 
-# HELPER FUNCTIONS
 def load_texts_from_dir(directory, label):
-    """Recursively reads every .txt file in all subfolders."""
     samples = []
 
     for root, _, files in os.walk(directory):
@@ -37,32 +38,26 @@ def load_texts_from_dir(directory, label):
                 path = os.path.join(root, fname)
                 try:
                     with open(path, "r", encoding="utf-8") as f:
-                        text = f.read()
-                        text = preprocess_text(text)
+                        text = preprocess_text(f.read())
                         samples.append((text, label))
                 except Exception as e:
-                    print(f"Error reading {path}: {e}")
+                    print(f"Error reading {path}: {e}")
 
     return samples
 
 
 def load_dataset():
-    """Loads fake and real news in a single format."""
-    print("📂 Loading data from the folders...")
     fake = load_texts_from_dir(FAKE_DIR, 0)
     real = load_texts_from_dir(REAL_DIR, 1)
 
-    dataset = fake + real
-    random.shuffle(dataset)
+    all_data = fake + real
+    random.shuffle(all_data)
 
-    print(f"✔ Total Fake: {len(fake)}")
-    print(f"✔ Total Real: {len(real)}")
-    print(f"✔ Total: {len(dataset)}")
+    print(f"Fake: {len(fake)} | Real: {len(real)} | Total: {len(all_data)}")
 
-    texts, labels = zip(*dataset)
+    texts, labels = zip(*all_data)
     return list(texts), list(labels)
 
-# TORCH DATASET
 class NewsDataset(Dataset):
     def __init__(self, texts, labels, tokenizer):
         self.texts = texts
@@ -84,47 +79,88 @@ class NewsDataset(Dataset):
         encoded["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
         return encoded
 
-# TRAINING PROCESS
 def train():
     texts, labels = load_dataset()
 
+    # Proper separation into train, validation, and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
+        texts, labels, test_size=0.20, stratify=labels, random_state=42
+    )
+
+    X_train, X_val, y_train, y_val = train_test_split(
+        X_train, y_train, test_size=0.10, stratify=y_train, random_state=42
+    )
+
+    print(f"\nTrain: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}\n")
+
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        MODEL_NAME, num_labels=2
+    ).to(device)
 
-    dataset = NewsDataset(texts, labels, tokenizer)
-    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
+    # LOADERS
+    train_dataset = NewsDataset(X_train, y_train, tokenizer)
+    val_dataset = NewsDataset(X_val, y_val, tokenizer)
+    test_dataset = NewsDataset(X_test, y_test, tokenizer)
+
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
 
     optimizer = AdamW(model.parameters(), lr=LR)
-    total_steps = len(loader) * EPOCHS
+    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)
 
-    print("\n🚀 Starting BERT fine-tuning...\n")
+    print("Starting fine-tuning...\n")
 
     model.train()
 
     for epoch in range(EPOCHS):
-        print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")
+        print(f"=== Epoch {epoch+1}/{EPOCHS} ===")
         epoch_loss = 0
 
-        for batch in tqdm(loader):
+        for batch in tqdm(train_loader):
             batch = {k: v.to(device) for k, v in batch.items()}
-
             outputs = model(**batch)
             loss = outputs.loss
-            epoch_loss += loss.item()
 
+            epoch_loss += loss.item()
             loss.backward()
+
             optimizer.step()
             scheduler.step()
             optimizer.zero_grad()
 
-        print(f"📉 Epoch loss: {epoch_loss / len(loader):.4f}")
+        print(f"Epoch loss: {epoch_loss / len(train_loader):.4f}")
+
+    print("\nEvaluating...")
+
+    model.eval()
+    all_preds = []
+    all_true = []
+
+    with torch.no_grad():
+        for batch in tqdm(test_loader):
+            labels = batch["labels"].numpy()
+            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
+
+            outputs = model(**inputs)
+            preds = outputs.logits.argmax(dim=1).cpu().numpy()
+
+            all_preds.extend(preds)
+            all_true.extend(labels)
+
+    # METRICS
+    print("\n=== Classification Report ===")
+    print(classification_report(all_true, all_preds, target_names=["Fake", "Real"]))
+    print("Accuracy:", accuracy_score(all_true, all_preds))
 
+    # SAVE MODEL
     os.makedirs(OUTPUT_DIR, exist_ok=True)
     model.save_pretrained(OUTPUT_DIR)
     tokenizer.save_pretrained(OUTPUT_DIR)
 
-    print(f"\n🎉 Model saved to: {OUTPUT_DIR}\n")
+    print(f"\nModel saved to: {OUTPUT_DIR}\n")
 
 
 if __name__ == "__main__":
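
One detail worth flagging in the new train_finetune.py: val_loader is created but never read, so the validation split goes unused. If a per-epoch validation pass is wanted, here is a minimal sketch against the names in the diff above (validate is a hypothetical helper, not part of this commit):

import torch

def validate(model, val_loader, device):
    # Accuracy on the held-out validation split.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            labels = batch["labels"].to(device)
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            preds = model(**inputs).logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    model.train()
    return correct / total

# Inside the epoch loop, after the loss print:
#     print(f"Val accuracy: {validate(model, val_loader, device):.4f}")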
train/utils/__pycache__/preprocess.cpython-312.pyc ADDED
Binary file (782 Bytes).
 
train/utils/preprocess.py ADDED
@@ -0,0 +1,11 @@
+import re
+import unicodedata
+
+def preprocess_text(text):
+    text = unicodedata.normalize("NFKC", text)
+    text = re.sub(r"http\S+|www\.\S+", "", text)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
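
For reference, a quick check of what preprocess_text does to a typical snippet (the input string is hypothetical; the expected output follows directly from the regexes above):

from utils.preprocess import preprocess_text  # import path as used by the training scripts

raw = "Veja: https://example.com/noticia <b>Lucros</b> sobem 12%!"
print(preprocess_text(raw))
# URLs and HTML tags are stripped; ':' and '%' fall outside the allowed
# character class and become spaces; whitespace is then collapsed:
# -> "Veja Lucros sobem 12 !"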