feat: Sync training infrastructure from main repository
- requirements.txt +21 -38
- training/model.py +16 -4
requirements.txt CHANGED
@@ -1,40 +1,23 @@
-#
-#
-
-# Hugging Face Hub for authentication and model upload
-huggingface_hub>=0.19.0
-
-# Gradio for web interface (latest stable version with security fixes)
-gradio>=5.31.0
-
-# PyTorch for model training
+# Core Machine Learning Dependencies
+# PyTorch - Deep learning framework for model training and inference
 torch>=2.0.0
-torchvision>=0.15.0
-
-# Transformers for model handling
-transformers>=4.35.0
-
-# SentencePiece for tokenization
-sentencepiece>=0.1.99
-
-# NumPy and other utilities
-numpy>=1.24.0
-pandas>=2.0.0
-
-# Additional utilities
-requests>=2.31.0
-tqdm>=4.65.0
-
-# Testing dependencies
-pytest>=7.0.0
-pytest-cov>=4.0.0
-
-# Development dependencies
-black>=23.0.0
-isort>=5.12.0
-bandit>=1.7.7
-safety>=2.3.0
 
-#
-
-
+# Hugging Face Ecosystem - Model loading, training, and tokenization
+transformers>=4.30.0      # Pre-trained models and training utilities
+datasets>=2.12.0          # Dataset loading and processing
+tokenizers>=0.13.0        # Fast tokenization library
+sentencepiece>=0.1.99     # SentencePiece tokenization
+huggingface_hub>=0.34.0   # Hugging Face Hub integration
+accelerate>=0.20.0        # Distributed training acceleration
+
+# User Interface - Gradio for web-based training interface
+gradio>=4.0.0             # Web UI framework for ML applications
+
+# Data Processing and Utilities
+numpy>=1.24.0             # Numerical computing library
+pandas>=2.0.0             # Data manipulation and analysis
+tqdm>=4.65.0              # Progress bars for long-running operations
+psutil>=5.9.0             # System and process utilities
+
+# Note: These versions are compatible with Hugging Face Spaces
+# and provide stable training performance
training/model.py CHANGED
@@ -414,12 +414,13 @@ class GPTModel(nn.Module):
     - Text generation (inference)
     """
 
-    def __init__(self, config: GPTConfig):
+    def __init__(self, config: GPTConfig, use_checkpoint=True):
         super().__init__()
         assert config.vocab_size is not None, "vocab_size must be specified"
         assert config.block_size is not None, "block_size must be specified"
 
         self.config = config
+        self.use_checkpoint = use_checkpoint
 
         # Embeddings
         self.transformer = nn.ModuleDict(
@@ -504,9 +505,20 @@ class GPTModel(nn.Module):
         # Combine embeddings and apply dropout
         x = self.transformer.drop(tok_emb + pos_emb)
 
-        # Pass through transformer blocks
-        for block in self.transformer.h:
-            x = block(x)
+        # Pass through transformer blocks with optional gradient checkpointing
+        if self.use_checkpoint and self.training:
+            # Use gradient checkpointing to save memory during training
+            try:
+                for block in self.transformer.h:
+                    x = torch.utils.checkpoint.checkpoint(block, x)
+            except AttributeError:
+                # Fallback for older PyTorch versions
+                for block in self.transformer.h:
+                    x = block(x)
+        else:
+            # Standard forward pass
+            for block in self.transformer.h:
+                x = block(x)
 
         # Final layer normalization
         x = self.transformer.ln_f(x)
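For context on the training/model.py change above: gradient checkpointing discards intermediate activations during the forward pass and recomputes them during backward, trading roughly one extra forward pass per block for a large drop in activation memory. With the new flag, GPTModel(config, use_checkpoint=False) restores the plain forward pass for inference-only or debugging runs. A minimal self-contained sketch of the same pattern, assuming nothing from this repo (the Block module and tensor sizes are illustrative stand-ins):

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class Block(nn.Module):
    # Illustrative stand-in for a transformer block, not the repo's implementation.
    def __init__(self, dim: int):
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x):
        return x + self.ff(self.ln(x))

blocks = nn.ModuleList(Block(64) for _ in range(4))
x = torch.randn(2, 16, 64, requires_grad=True)

# Each block's activations are recomputed in backward instead of stored,
# cutting activation memory at the cost of extra forward compute.
for block in blocks:
    x = checkpoint(block, x, use_reentrant=False)

x.sum().backward()  # recomputation happens here, inside each checkpoint region

One design note: the committed code calls torch.utils.checkpoint.checkpoint(block, x) without use_reentrant, which on PyTorch 2.x selects the legacy reentrant variant and emits a deprecation warning; passing use_reentrant=False explicitly, as in the sketch, is the generally recommended form on recent releases.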