# test_integration.py
import os
import sys
import json
from pathlib import Path
from io import StringIO
import contextlib

# Add the project root to Python path for imports
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Create a string buffer to capture output
output_buffer = StringIO()
with contextlib.redirect_stdout(output_buffer):
    # Now import modules
    from config.enums import ModelType, Domain, Language
    from config.schemas import ModelConfig, ExtractedDocument, ProcessedText
    from config.constants import document_extraction_params
    from config.model_config import MODEL_REGISTRY, get_model_config
    from config.settings import settings
    from config.threshold_config import get_threshold_for_domain

    print("=" * 70)
    print("CONFIG MODULE INTEGRATION TEST")
    print("=" * 70)

    # Test 1: Enum usage
    print(f"\nβœ“ Model Types: {[m.value for m in ModelType][:5]}...")

    # Test 2: Schema instantiation
    config = ModelConfig(
        model_id="test",
        model_type=ModelType.TRANSFORMER,
        description="Test",
        size_mb=100
    )
    print(f"βœ“ Schema instantiation: {config.model_id}")

    # Test 3: Constants usage
    print(f"βœ“ Max file size: {document_extraction_params.MAX_FILE_SIZE / 1024 / 1024:.1f} MB")

    # Test 4: Model registry
    print(f"βœ“ Available models: {list(MODEL_REGISTRY.keys())}")

    # Test 5: Settings
    print(f"βœ“ App name: {settings.APP_NAME}")
    print(f"βœ“ Environment: {settings.ENVIRONMENT}")
    print(f"βœ“ Log dir: {settings.LOG_DIR}")
    print(f"βœ“ Model cache dir: {settings.MODEL_CACHE_DIR}")

    # Test 6: Thresholds
    thresholds = get_threshold_for_domain(Domain.ACADEMIC)
    print(f"βœ“ Academic thresholds: {thresholds.ensemble_threshold}")
print("\n" + "=" * 70)
print("PROCESSORS MODULE INTEGRATION TEST")
print("=" * 70)
# Test 7: Document Extractor
try:
from processors.document_extractor import DocumentExtractor
# Create a test text file
test_text = "This is a test document for integration testing.\n" * 10
test_file = Path("test_document.txt")
# Write test file
test_file.write_text(test_text)
# Test extractor
extractor = DocumentExtractor(extract_metadata=True)
result = extractor.extract(str(test_file))
print(f"\nβœ“ Document Extractor Test:")
print(f" - Success: {result.is_success}")
print(f" - Text length: {len(result.text)} chars")
print(f" - File type: {result.file_type}")
print(f" - Method: {result.extraction_method}")
# Clean up test file
test_file.unlink()
except Exception as e:
print(f"\nβœ— Document Extractor failed: {e}")
# Test 8: Text Processor
try:
# First check if we have the needed constants
from config.constants import text_processing_params
print(f"\nβœ“ Text processing params available")
from processors.text_processor import TextProcessor
test_text = "This is a sample text for processing. It contains multiple sentences! " \
"Here is another sentence. And one more for testing."
processor = TextProcessor()
processed = processor.process(test_text)
print(f"\nβœ“ Text Processor Test:")
print(f" - Is valid: {processed.is_valid}")
print(f" - Words: {processed.word_count}")
print(f" - Sentences: {processed.sentence_count}")
print(f" - Avg sentence length: {processed.avg_sentence_length:.1f}")
print(f" - Avg word length: {processed.avg_word_length:.1f}")
except Exception as e:
print(f"\nβœ— Text Processor failed: {e}")
print(" Note: You need to add TextProcessingParams to constants.py")
# Test 9: Domain Classifier (without model)
try:
from processors.domain_classifier import DomainClassifier, get_domain_name, is_technical_domain
test_text = "This is a scientific paper about machine learning and artificial intelligence."
classifier = DomainClassifier()
print(f"\nβœ“ Domain Classifier initialized")
# Note: This will fail if models aren't loaded, but we can test the class structure
print(f" - Class structure verified")
print(f" - Domain enum available")
# Test helper functions
ai_ml_domain = Domain.AI_ML
print(f" - AI/ML domain name: {get_domain_name(ai_ml_domain)}")
print(f" - Is technical domain: {is_technical_domain(ai_ml_domain)}")
except Exception as e:
print(f"\nβœ— Domain Classifier setup failed: {e}")
# Test 10: Language Detector (heuristic mode)
try:
from processors.language_detector import LanguageDetector
# Test in English
english_text = "This is an English text for language detection testing."
# Use heuristic mode (no model dependency)
detector = LanguageDetector(use_model=False)
result = detector.detect(english_text)
print(f"\nβœ“ Language Detector Test (heuristic):")
print(f" - Primary language: {result.primary_language.value}")
print(f" - Evidence strength: {result.evidence_strength:.2f}")
print(f" - Method: {result.detection_method}")
print(f" - Script: {result.script.value}")
# Test language check
is_english = detector.is_language(english_text, Language.ENGLISH, threshold=0.5)
print(f" - Is English check: {is_english}")
except Exception as e:
print(f"\nβœ— Language Detector failed: {e}")
print("\n" + "=" * 70)
print("MODELS MODULE INTEGRATION TEST")
print("=" * 70)
# Test 11: Model Registry
try:
from models.model_registry import ModelRegistry, get_model_registry
registry = get_model_registry()
print(f"\nβœ“ Model Registry Test:")
print(f" - Singleton pattern working")
print(f" - Registry initialized")
# Test usage tracking
registry.record_model_usage("test_model", 1.5)
stats = registry.get_usage_stats("test_model")
print(f" - Usage tracking: {stats.usage_count if stats else 'N/A'}")
# Test dependency tracking
registry.add_dependency("model_b", ["model_a"])
deps = registry.get_dependencies("model_b")
print(f" - Dependency tracking: {deps}")
# Generate report
report = registry.generate_usage_report()
print(f" - Report generation: {len(report)} items")
# Test reset
registry.reset_usage_stats("test_model")
print(f" - Reset functionality working")
except Exception as e:
print(f"\nβœ— Model Registry failed: {e}")
# Test 12: Model Manager (without actual downloads)
try:
from models.model_manager import ModelManager, get_model_manager
manager = get_model_manager()
print(f"\nβœ“ Model Manager Test:")
print(f" - Singleton pattern working")
print(f" - Device: {manager.device}")
print(f" - Cache directory: {manager.cache_dir}")
# Test metadata
metadata = manager.metadata
print(f" - Metadata loaded: {len(metadata)} entries")
# Test cache
cache_size = manager.cache.size()
print(f" - Cache initialized: size {cache_size}")
# Test model info check
model_name = list(MODEL_REGISTRY.keys())[0] if MODEL_REGISTRY else "perplexity_reference_lm"
is_downloaded = manager.is_model_downloaded(model_name)
print(f" - Model check: {model_name} downloaded={is_downloaded}")
# Test memory usage
memory_info = manager.get_memory_usage()
print(f" - Memory monitoring: {len(memory_info)} metrics")
# Test model configuration access
model_config = get_model_config(model_name)
if model_config:
print(f" - Model config access: {model_config.model_id}")
except Exception as e:
print(f"\nβœ— Model Manager failed: {e}")
# Test 13: Integration between models and config
try:
print(f"\nβœ“ Config-Models Integration Test:")
# Check model config from registry
for model_name, config in MODEL_REGISTRY.items():
if config.required:
print(f" - {model_name}: {config.model_type.value}")
break
# Check settings integration
print(f" - Max cached models from settings: {settings.MAX_CACHED_MODELS}")
print(f" - Use quantization from settings: {settings.USE_QUANTIZATION}")
except Exception as e:
print(f"\nβœ— Config-Models integration failed: {e}")
# Test 14: End-to-End System Integration
try:
print(f"\n" + "=" * 70)
print("FULL SYSTEM INTEGRATION TEST")
print("=" * 70)
# Create a test scenario
sample_text = """
Machine learning is a subset of artificial intelligence.
It involves algorithms that learn patterns from data.
Deep learning uses neural networks with multiple layers.
"""
# 1. Process text
from processors.text_processor import TextProcessor
processor = TextProcessor()
processed = processor.process(sample_text)
print(f"βœ“ 1. Text Processing Complete:")
print(f" - Cleaned text: {len(processed.cleaned_text)} chars")
print(f" - Valid: {processed.is_valid}")
# 2. Detect language
from processors.language_detector import LanguageDetector
detector = LanguageDetector(use_model=False)
lang_result = detector.detect(processed.cleaned_text)
print(f"\nβœ“ 2. Language Detection Complete:")
print(f" - Language: {lang_result.primary_language.value}")
print(f" - Script: {lang_result.script.value}")
# 3. Domain classification structure
from processors.domain_classifier import get_domain_name, is_technical_domain
ai_ml_domain = Domain.AI_ML
print(f"\nβœ“ 3. Domain System Ready:")
print(f" - Domain enum: {ai_ml_domain.value}")
print(f" - Human name: {get_domain_name(ai_ml_domain)}")
print(f" - Is technical: {is_technical_domain(ai_ml_domain)}")
# 4. Model management
from models.model_manager import get_model_manager
from models.model_registry import get_model_registry
model_manager = get_model_manager()
model_registry = get_model_registry()
print(f"\nβœ“ 4. Model Management Ready:")
print(f" - Manager: {type(model_manager).__name__}")
print(f" - Registry: {type(model_registry).__name__}")
print(f" - Cache dir exists: {model_manager.cache_dir.exists()}")
# 5. Settings integration
print(f"\nβœ“ 5. Settings Integration:")
print(f" - App: {settings.APP_NAME} v{settings.APP_VERSION}")
print(f" - Environment: {settings.ENVIRONMENT}")
print(f" - Debug: {settings.DEBUG}")
print(f"\n🎯 FULL SYSTEM INTEGRATION SUCCESSFUL!")
except Exception as e:
print(f"\nβœ— Full system integration failed: {e}")
import traceback
print(traceback.format_exc())
print("\n" + "=" * 70)
print("TEST COMPLETED")
print("=" * 70)
# Get the captured output
output_text = output_buffer.getvalue()
# Print the output
print(output_text)
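
# Optional: also persist the captured report next to this script so a run can be
# inspected after the console output scrolls away (the file name is an arbitrary choice).
report_path = Path(__file__).with_name("integration_test_report.txt")
report_path.write_text(output_text, encoding="utf-8")
print(f"Report written to {report_path}")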
# Count successes and failures
success_count = sum(1 for line in output_text.split('\n') if 'βœ“' in line)
failure_count = sum(1 for line in output_text.split('\n') if 'βœ—' in line)
print(f"Successes: {success_count}")
print(f"Failures: {failure_count}")
if failure_count == 0:
    print("\nπŸŽ‰ ALL TESTS PASSED! Complete system is properly integrated.")
else:
    print(f"\n⚠️ {failure_count} tests failed. Check the issues above.")