Upload folder using huggingface_hub

e78f911 verified 7 months ago

10 kB

	"""
	TinyByteCNN Model for Fiction vs Non-Fiction Classification
	"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	import unicodedata
	import re
	from typing import Union, List


	class SE(nn.Module):
	    """Squeeze-and-Excitation channel gate for ``[B, C, T]`` feature maps.

	    The input is averaged over its last axis, passed through a two-layer
	    bottleneck MLP (SiLU, then sigmoid), and the resulting per-channel
	    weights in (0, 1) rescale the input.

	    Args:
	        c: Number of channels of the input (size of dim 1).
	        r: Reduction ratio; the bottleneck width is ``max(c // r, 4)``.
	    """

	    def __init__(self, c, r=8):
	        super().__init__()
	        # Never let the bottleneck shrink below 4 units, even for small c.
	        m = max(c // r, 4)
	        self.fc1 = nn.Linear(c, m)
	        self.fc2 = nn.Linear(m, c)

	    def forward(self, x):
	        """Return ``x`` rescaled channel-wise; output has the same shape as ``x``."""
	        # x: [B, C, T]
	        s = x.mean(dim=-1)  # squeeze over the last axis -> [B, C]
	        s = F.silu(self.fc1(s))
	        s = torch.sigmoid(self.fc2(s))  # per-channel gate -> [B, C]
	        # Broadcast the gate back over the last axis.
	        return x * s.unsqueeze(-1)


	class SepResBlock(nn.Module):
	    """Depthwise-separable residual block with Squeeze-Excitation gating.

	    Pipeline: depthwise conv -> norm -> SiLU -> pointwise (1x1) conv -> norm
	    -> SE -> dropout, added to the (optionally projected) input and passed
	    through a final SiLU.

	    Args:
	        c_in: Input channel count.
	        c_out: Output channel count.
	        k: Kernel size of the depthwise convolution.
	        stride: Stride of the depthwise convolution (and of the shortcut
	            projection, when one is created).
	        dilation: Dilation of the depthwise convolution.
	        use_gn: If true, normalise with ``GroupNorm(32, c)`` instead of
	            ``BatchNorm1d``.
	        se_ratio: Reduction ratio forwarded to the ``SE`` module.
	        drop: Probability for the ``nn.Dropout`` applied to the residual
	            branch before the addition.
	    """

	    def __init__(self, c_in, c_out, k=7, stride=1, dilation=1, use_gn=False, se_ratio=8, drop=0.0):
	        super().__init__()
	        # Factory producing the normalisation layer for a given channel count.
	        Norm = (lambda c: nn.GroupNorm(32, c)) if use_gn else nn.BatchNorm1d

	        # Depthwise conv: groups == c_in, one filter per input channel.
	        self.dw = nn.Conv1d(
	            c_in, c_in, k,
	            stride=stride,
	            dilation=dilation,
	            padding=((k - 1) // 2) * dilation,
	            groups=c_in,
	            bias=False,
	        )
	        self.bn1 = Norm(c_in)
	        # Pointwise conv mixes channels and changes the width to c_out.
	        self.pw = nn.Conv1d(c_in, c_out, 1, bias=False)
	        self.bn2 = Norm(c_out)
	        self.se = SE(c_out, se_ratio)
	        self.drop = nn.Dropout(p=drop)

	        # The shortcut needs a 1x1 projection whenever the residual branch
	        # changes the length (stride) or the channel count.
	        self.proj = None
	        if stride != 1 or c_in != c_out:
	            self.proj = nn.Conv1d(c_in, c_out, 1, stride=stride, bias=False)

	    def forward(self, x):
	        """Apply the block to ``x`` and return the activated residual sum."""
	        y = self.dw(x)
	        y = F.silu(self.bn1(y))
	        y = self.pw(y)
	        y = self.bn2(y)
	        y = self.se(y)
	        if self.proj is not None:
	            x = self.proj(x)
	        y = self.drop(y)
	        return F.silu(x + y)


	class TinyByteCNN(nn.Module):
	    """Byte-level 1-D CNN producing a single fiction / non-fiction logit.

	    Architecture: byte embedding -> strided stem conv -> four stages of
	    ``SepResBlock`` (the first block of every stage has stride 2) ->
	    concatenated global average + max pooling -> dropout -> linear head.

	    Args:
	        config: ``None`` (use the defaults below), a ``dict`` of
	            hyper-parameters (missing keys fall back to the defaults), or
	            any object exposing the hyper-parameters as attributes:
	            ``vocab_size``, ``embed_dim``, ``widths``, ``use_gn``,
	            ``head_drop`` and ``stochastic_depth``.
	    """

	    # Hyper-parameters used when no config, or only a partial one, is given.
	    _DEFAULT_CONFIG = {
	        'vocab_size': 256,
	        'embed_dim': 32,
	        'widths': [128, 192, 256, 320],
	        'use_gn': False,
	        'head_drop': 0.1,
	        'stochastic_depth': 0.05,
	    }

	    def __init__(self, config=None):
	        super().__init__()

	        config = self._coerce_config(config)
	        self.config = config

	        # Embedding layer for bytes
	        self.embed = nn.Embedding(config.vocab_size, config.embed_dim)

	        # Stem convolution (halves the sequence length)
	        self.stem = nn.Conv1d(config.embed_dim, config.widths[0], 5, stride=2, padding=2, bias=False)
	        if config.use_gn:
	            self.bn0 = nn.GroupNorm(32, config.widths[0])
	        else:
	            self.bn0 = nn.BatchNorm1d(config.widths[0])

	        # Stage plan: (number of blocks, output width, dilation per block)
	        stage_plan = [
	            (2, config.widths[0], [1, 2]),
	            (2, config.widths[1], [1, 2]),
	            (3, config.widths[2], [1, 2, 4]),
	            (3, config.widths[3], [1, 2, 8]),
	        ]

	        blocks = []
	        c_prev = config.widths[0]
	        for depth, width, dilations in stage_plan:
	            for i in range(depth):
	                blocks.append(SepResBlock(
	                    c_prev, width, k=7,
	                    # Only the first block of a stage downsamples.
	                    stride=2 if i == 0 else 1,
	                    dilation=dilations[i],
	                    use_gn=config.use_gn,
	                    drop=config.stochastic_depth,
	                ))
	                c_prev = width

	        self.stages = nn.Sequential(*blocks)

	        # Classification head; the input is avg-pool and max-pool concatenated.
	        self.head = nn.Sequential(
	            nn.Dropout(p=config.head_drop),
	            nn.Linear(2 * config.widths[-1], 1)
	        )

	    @classmethod
	    def _coerce_config(cls, config):
	        """Turn ``None`` or a ``dict`` into an attribute-style config with defaults filled in.

	        Any other object is returned unchanged and must already expose the
	        required attributes.
	        """
	        from types import SimpleNamespace

	        if config is None:
	            config = {}
	        if isinstance(config, dict):
	            merged = {**cls._DEFAULT_CONFIG, **config}
	            # Copy so instances never share the class-level default list.
	            merged['widths'] = list(merged['widths'])
	            return SimpleNamespace(**merged)
	        return config

	    @classmethod
	    def _config_to_dict(cls, config):
	        """Flatten a config object into a plain, picklable ``dict``."""
	        if isinstance(config, dict):
	            return dict(config)
	        # Known hyper-parameters may live on the class (not the instance),
	        # so read them with getattr rather than relying on vars().
	        flat = {k: getattr(config, k) for k in cls._DEFAULT_CONFIG if hasattr(config, k)}
	        extras = getattr(config, '__dict__', {})
	        flat.update({k: v for k, v in extras.items() if not k.startswith('_')})
	        return flat

	    @classmethod
	    def _from_state_dict(cls, state_dict, config=None):
	        """Build a model from ``config`` and load ``state_dict`` into it."""
	        model = cls(config)
	        model.load_state_dict(state_dict)
	        return model

	    @classmethod
	    def _from_checkpoint(cls, checkpoint):
	        """Build a model from a ``torch.load`` result.

	        Accepts either a ``{'model_state_dict': ..., 'config': ...}``
	        mapping or a bare state dict.
	        """
	        config = checkpoint.get('config', None)
	        state_dict = checkpoint.get('model_state_dict', checkpoint)
	        return cls._from_state_dict(state_dict, config)

	    def forward(self, x_bytes):
	        """
	        Args:
	            x_bytes: [B, T] uint8 tensor of byte values
	        Returns:
	            logits: [B] tensor of binary classification logits
	        """
	        x = self.embed(x_bytes.long())  # [B, T, E]
	        x = x.transpose(1, 2).contiguous()  # [B, E, T]
	        x = F.silu(self.bn0(self.stem(x)))  # [B, C0, T/2]
	        x = self.stages(x)  # [B, C, T/32]

	        # Global pooling over the time axis
	        avg = x.mean(dim=-1)
	        mx = x.amax(dim=-1)
	        feats = torch.cat([avg, mx], dim=1)

	        logits = self.head(feats).squeeze(1)
	        return logits

	    @classmethod
	    def from_pretrained(cls, path_or_repo, use_safetensors=True):
	        """Load pretrained model (supports both .bin and .safetensors).

	        Args:
	            path_or_repo: A local directory (containing ``model.safetensors``
	                and/or ``pytorch_model.bin``, optionally ``config.json``),
	                a local weights file, or otherwise a HuggingFace Hub repo id.
	            use_safetensors: Prefer ``model.safetensors`` over
	                ``pytorch_model.bin`` for directories and Hub repos.

	        Returns:
	            A ``TinyByteCNN`` with the weights loaded.

	        Raises:
	            FileNotFoundError: If a local directory holds no loadable weights.
	        """
	        import os
	        from pathlib import Path

	        # Accept pathlib.Path as well as str.
	        target = os.fspath(path_or_repo)

	        if os.path.isdir(target):
	            base_path = Path(target)
	            safetensors_path = base_path / "model.safetensors"
	            pytorch_path = base_path / "pytorch_model.bin"

	            if use_safetensors and safetensors_path.exists():
	                from safetensors.torch import load_file

	                # safetensors carries no config, so read config.json if present.
	                config = None
	                config_path = base_path / "config.json"
	                if config_path.exists():
	                    import json
	                    with open(config_path, encoding="utf-8") as f:
	                        config = json.load(f)
	                return cls._from_state_dict(load_file(str(safetensors_path)), config)

	            if pytorch_path.exists():
	                # SECURITY: weights_only=False unpickles arbitrary objects;
	                # only load checkpoints from trusted sources.
	                checkpoint = torch.load(pytorch_path, weights_only=False, map_location='cpu')
	                return cls._from_checkpoint(checkpoint)

	            raise FileNotFoundError(
	                f"No loadable weights in {target!r} "
	                "(looked for model.safetensors / pytorch_model.bin)"
	            )

	        if os.path.isfile(target):
	            if target.endswith('.safetensors'):
	                from safetensors.torch import load_file
	                return cls._from_state_dict(load_file(target))
	            # SECURITY: see note above about weights_only=False.
	            checkpoint = torch.load(target, weights_only=False, map_location='cpu')
	            return cls._from_checkpoint(checkpoint)

	        # HuggingFace hub loading
	        from huggingface_hub import hf_hub_download

	        if use_safetensors:
	            try:
	                from safetensors.torch import load_file
	                model_file = hf_hub_download(repo_id=target, filename="model.safetensors")
	                return cls._from_state_dict(load_file(model_file))
	            except Exception:
	                # Deliberate best-effort: a missing file, a missing
	                # safetensors package, or weights that do not fit the default
	                # config all fall back to the pickle checkpoint, which may
	                # carry its own config.
	                pass

	        model_file = hf_hub_download(repo_id=target, filename="pytorch_model.bin")
	        # SECURITY: see note above about weights_only=False.
	        checkpoint = torch.load(model_file, weights_only=False, map_location='cpu')
	        return cls._from_checkpoint(checkpoint)

	    def save_pretrained(self, save_path):
	        """Save model to directory.

	        Writes ``pytorch_model.bin`` containing ``model_state_dict`` and
	        ``config``. The config is stored as a plain dict so the file does not
	        depend on any config class being importable (or picklable).
	        """
	        import os
	        os.makedirs(save_path, exist_ok=True)

	        torch.save({
	            'model_state_dict': self.state_dict(),
	            'config': self._config_to_dict(self.config)
	        }, os.path.join(save_path, 'pytorch_model.bin'))


	def preprocess_text(text: str, max_len: int = 4096) -> torch.Tensor:
	    """
	    Preprocess text to bytes for model input

	    Steps: Unicode NFC normalisation, ``\\r\\n`` -> ``\\n``, every run of
	    three or more whitespace characters replaced by a single space, UTF-8
	    encoding, then truncation / zero-padding to ``max_len`` bytes.

	    Args:
	        text: Input text string
	        max_len: Maximum sequence length (default 4096)

	    Returns:
	        uint8 tensor of shape [1, max_len] containing byte values
	    """
	    # Unicode NFC normalize
	    text = unicodedata.normalize('NFC', text)

	    # Replace \r\n → \n
	    text = text.replace('\r\n', '\n')

	    # Runs of 3+ whitespace characters become ONE space; runs of 1-2 are kept.
	    text = re.sub(r'\s{3,}', ' ', text)

	    # Convert to bytes (characters that cannot be encoded are dropped)
	    text_bytes = text.encode('utf-8', errors='ignore')[:max_len]

	    # Zero-padded buffer; copy the bytes in without a per-byte Python list.
	    input_ids = np.zeros(max_len, dtype=np.uint8)
	    n = len(text_bytes)
	    if n:
	        input_ids[:n] = np.frombuffer(text_bytes, dtype=np.uint8)

	    return torch.from_numpy(input_ids).unsqueeze(0)  # Add batch dimension


	def classify_text(text: Union[str, List[str]], model=None, device='cpu'):
	    """
	    Classify text as fiction or non-fiction

	    Args:
	        text: Single string or list of strings to classify
	        model: Pre-loaded model (optional); when omitted, one is loaded via
	            ``TinyByteCNN.from_pretrained("fiction_classifier_hf")``
	        device: Device to run on ('cpu', 'cuda', 'mps')

	    Returns:
	        For a single string, one dictionary with keys ``text`` (input,
	        shortened to 100 characters plus ``...`` when longer),
	        ``prediction`` ("Fiction" / "Non-Fiction"), ``confidence`` and
	        ``probability_nonfiction``. For a list, a list of such dictionaries.

	    Note:
	        The model is moved to ``device`` and switched to eval mode.
	    """
	    if model is None:
	        model = TinyByteCNN.from_pretrained("fiction_classifier_hf")

	    model = model.to(device)
	    model.eval()

	    # Handle single text or batch
	    is_single = isinstance(text, str)
	    texts = [text] if is_single else text

	    results = []

	    # One no-grad scope for the whole loop; inference only.
	    with torch.no_grad():
	        for t in texts:
	            input_ids = preprocess_text(t).to(device)
	            # Sigmoid of the logit is the probability of the non-fiction class.
	            prob = torch.sigmoid(model(input_ids)).item()

	            is_nonfiction = prob > 0.5
	            results.append({
	                'text': t[:100] + '...' if len(t) > 100 else t,
	                'prediction': "Non-Fiction" if is_nonfiction else "Fiction",
	                # Confidence is the probability of whichever class was chosen.
	                'confidence': prob if is_nonfiction else (1 - prob),
	                'probability_nonfiction': prob
	            })

	    return results[0] if is_single else results


	if __name__ == "__main__":
	    # Example usage: classify one sentence with a locally stored checkpoint.
	    sample_text = "The detective's coffee had gone cold hours ago, but she hardly noticed."

	    # Load and use model
	    model = TinyByteCNN.from_pretrained("fiction_model_output_cnn/best_model.pt")
	    result = classify_text(sample_text, model)

	    print(f"Text: {result['text']}")
	    print(f"Prediction: {result['prediction']}")
	    print(f"Confidence: {result['confidence']:.1%}")