Spaces:
Paused
Paused
| import os | |
| import tempfile | |
| import pytest | |
| from laser_encoders.download_models import LaserModelDownloader | |
| from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE | |
| from laser_encoders.laser_tokenizer import initialize_tokenizer | |
| from laser_encoders.models import initialize_encoder | |
def test_validate_language_models_and_tokenize_laser3(lang):
    """Smoke-test the LASER3 download, encoder and tokenizer for ``lang``.

    The model is downloaded into a fresh temporary directory. For the
    languages listed in ``expected_to_fail`` the download itself must raise
    ``ValueError``; for every other language the encoder and tokenizer are
    initialized from the downloaded files and one sample sentence is
    tokenized. Nothing is asserted about the tokenizer output.

    NOTE(review): ``lang`` has to be supplied by a fixture or a
    parametrization that is not visible in this part of the file.
    """
    # Languages for which download_laser3() is expected to raise ValueError.
    expected_to_fail = ("kashmiri", "kas", "central kanuri", "knc")

    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)
        fetcher = LaserModelDownloader(model_dir=scratch_dir)

        if lang not in expected_to_fail:
            fetcher.download_laser3(lang)
            _encoder = initialize_encoder(lang, model_dir=scratch_dir)
            laser_tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

            # Test tokenization with a sample sentence
            _tokens = laser_tokenizer.tokenize("This is a sample sentence.")
        else:
            with pytest.raises(ValueError) as raised:
                fetcher.download_laser3(lang)
            # NOTE(review): this inspects the exception *message*, so it only
            # passes if the downloader's message itself contains the text
            # "ValueError" -- confirm against the downloader implementation.
            assert "ValueError" in str(raised.value)
            print(f"{lang} language model raised a ValueError as expected.")

        print(f"{lang} model validated successfully")
def test_validate_language_models_and_tokenize_laser2(lang):
    """Smoke-test the LASER2 download, encoder and tokenizer for ``lang``.

    The LASER2 files are downloaded into a fresh temporary directory, the
    encoder and tokenizer for ``lang`` are initialized from that directory,
    and one sample sentence is tokenized. The test passes as long as none of
    these steps raises; the tokenizer output itself is not checked.

    NOTE(review): ``lang`` has to be supplied by a fixture or a
    parametrization that is not visible in this part of the file.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print(f"Created temporary directory for {lang}", scratch_dir)

        fetcher = LaserModelDownloader(model_dir=scratch_dir)
        fetcher.download_laser2()

        _encoder = initialize_encoder(lang, model_dir=scratch_dir)
        laser_tokenizer = initialize_tokenizer(lang, model_dir=scratch_dir)

        # Test tokenization with a sample sentence
        _tokens = laser_tokenizer.tokenize("This is a sample sentence.")

        print(f"{lang} model validated successfully")
class MockLaserModelDownloader(LaserModelDownloader):
    """Downloader stand-in that only checks for files already on disk.

    Instead of fetching anything, each ``download_*`` method verifies that
    the expected model files exist under ``model_dir`` and raises
    ``FileNotFoundError`` for the first one that is missing.
    """

    def __init__(self, model_dir):
        """Remember ``model_dir``; the parent initializer is not called."""
        # NOTE(review): skipping super().__init__() looks deliberate
        # (presumably to avoid whatever setup the real downloader does) --
        # confirm against LaserModelDownloader.
        self.model_dir = model_dir

    def download_laser3(self, lang):
        """Check that the LASER3 checkpoint for ``lang`` exists locally.

        ``lang`` is first resolved through the inherited
        ``get_language_code`` using ``LASER3_LANGUAGE``.

        Raises:
            FileNotFoundError: if ``laser3-<code>.v1.pt`` is not present in
                ``model_dir``.
        """
        code = self.get_language_code(LASER3_LANGUAGE, lang)
        checkpoint = os.path.join(self.model_dir, f"laser3-{code}.v1.pt")
        if os.path.exists(checkpoint):
            return
        raise FileNotFoundError(f"Could not find {checkpoint}.")

    def download_laser2(self):
        """Check that all three LASER2 files exist locally.

        Raises:
            FileNotFoundError: for the first of ``laser2.pt``,
                ``laser2.spm`` and ``laser2.cvocab`` missing from
                ``model_dir``.
        """
        for required_name in ("laser2.pt", "laser2.spm", "laser2.cvocab"):
            candidate = os.path.join(self.model_dir, required_name)
            if os.path.exists(candidate):
                continue
            raise FileNotFoundError(f"Could not find {candidate}.")
# Directory the mock-downloader tests read pre-downloaded model files from.
# Set the LASER_CACHE_DIR environment variable to point the tests at another
# location without editing this file; when the variable is unset or empty the
# original hard-coded default is used.
CACHE_DIR = os.environ.get("LASER_CACHE_DIR") or "/home/user/.cache/models"
| # This uses the mock downloader | |
def test_validate_language_models_and_tokenize_mock_laser3(lang):
    """Validate the LASER3 model for ``lang`` using files already in CACHE_DIR.

    ``MockLaserModelDownloader`` only checks that the checkpoint exists in
    ``CACHE_DIR``. If it is missing the test fails with the downloader's
    message; otherwise the encoder and tokenizer are initialized from
    ``CACHE_DIR`` and one sample sentence is tokenized. The test passes as
    long as none of these steps raises.

    NOTE(review): ``lang`` has to be supplied by a fixture or a
    parametrization that is not visible in this part of the file.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
    try:
        downloader.download_laser3(lang)
    except FileNotFoundError as e:
        # pytest has no ``error`` attribute, so the previous
        # ``raise pytest.error(...)`` surfaced as an AttributeError.
        # pytest.fail() raises the failure itself with the given message.
        pytest.fail(str(e))

    # Smoke checks only: the return values are not inspected.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
    tokenizer.tokenize("This is a sample sentence.")
    print(f"{lang} model validated successfully")
| # This uses the mock downloader | |
def test_validate_language_models_and_tokenize_mock_laser2(lang):
    """Validate the LASER2 model for ``lang`` using files already in CACHE_DIR.

    ``MockLaserModelDownloader`` only checks that the LASER2 files exist in
    ``CACHE_DIR``. If one is missing the test fails with the downloader's
    message; otherwise the encoder and tokenizer are initialized from
    ``CACHE_DIR`` and one sample sentence is tokenized. The test passes as
    long as none of these steps raises.

    NOTE(review): ``lang`` has to be supplied by a fixture or a
    parametrization that is not visible in this part of the file.
    """
    downloader = MockLaserModelDownloader(model_dir=CACHE_DIR)
    try:
        downloader.download_laser2()
    except FileNotFoundError as e:
        # pytest has no ``error`` attribute, so the previous
        # ``raise pytest.error(...)`` surfaced as an AttributeError.
        # pytest.fail() raises the failure itself with the given message.
        pytest.fail(str(e))

    # Smoke checks only: the return values are not inspected.
    initialize_encoder(lang, model_dir=CACHE_DIR)
    tokenizer = initialize_tokenizer(lang, model_dir=CACHE_DIR)
    tokenizer.tokenize("This is a sample sentence.")
    print(f"{lang} model validated successfully")