Update app.py
Browse files
app.py
CHANGED
|
@@ -43,7 +43,8 @@ models = ["Helsinki-NLP", "QUICKMT", "Argos", "Lego-MT/Lego-MT", "HPLT", "HPLT-O
|
|
| 43 |
"bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
|
| 44 |
"t5-small", "t5-base", "t5-large",
|
| 45 |
"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
|
| 46 |
-
"google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
|
|
|
|
| 47 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
| 48 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
| 49 |
"HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
|
|
@@ -264,6 +265,29 @@ class Translators:
|
|
| 264 |
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
|
| 265 |
translated_text = translator(text, max_length=512)
|
| 266 |
return translated_text[0]['translation_text']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
def smollm(self):
|
| 269 |
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
@@ -568,6 +592,9 @@ def translate_text(model_name: str, s_language: str, t_language: str, input_text
|
|
| 568 |
|
| 569 |
elif 'madlad' in model_name.lower():
|
| 570 |
translated_text = Translators(model_name, sl, tl, input_text).madlad()
|
|
|
|
|
|
|
|
|
|
| 571 |
|
| 572 |
elif 'mt0' in model_name.lower():
|
| 573 |
translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
|
|
|
|
| 43 |
"bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
|
| 44 |
"t5-small", "t5-base", "t5-large",
|
| 45 |
"google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
|
| 46 |
+
"google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
|
| 47 |
+
"Heng666/madlad400-3b-mt-ct2-int8", "Heng666/madlad400-7b-mt-ct2-int8",
|
| 48 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
| 49 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
| 50 |
"HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
|
|
|
|
| 265 |
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
|
| 266 |
translated_text = translator(text, max_length=512)
|
| 267 |
return translated_text[0]['translation_text']
|
| 268 |
+
|
| 269 |
+
def madladct2(self):
    """Translate ``self.input_text`` with a CTranslate2-quantized MADLAD-400 model.

    Downloads the model snapshot from the Hugging Face Hub, loads its
    SentencePiece tokenizer, and runs a single greedy translation.

    Returns:
        str: the translated sentence.
    """
    # Heavy optional deps are imported lazily so the rest of the app
    # works when ctranslate2/sentencepiece are not installed.
    import ctranslate2
    from huggingface_hub import snapshot_download
    from sentencepiece import SentencePieceProcessor

    # NOTE(review): downloads the full model on first call — consider caching
    # the Translator/tokenizer across calls instead of rebuilding each time.
    model_path = snapshot_download(self.model_name)

    tokenizer = SentencePieceProcessor()
    tokenizer.load(f"{model_path}/sentencepiece.model")
    translator = ctranslate2.Translator(model_path)

    # MADLAD-400 expects the target language as a "<2xx>" prefix token.
    input_tokens = tokenizer.encode(f"<2{self.tl}> {self.input_text}", out_type=str)
    results = translator.translate_batch(
        [input_tokens],
        batch_type="tokens",
        max_batch_size=512,
        beam_size=1,               # greedy decoding keeps latency low
        no_repeat_ngram_size=1,
        repetition_penalty=2,
    )
    # Fix: the committed line read `return translated_sentence)` — a stray
    # closing parenthesis that made the file a SyntaxError.
    translated_sentence = tokenizer.decode(results[0].hypotheses[0])
    return translated_sentence
|
| 291 |
|
| 292 |
def smollm(self):
|
| 293 |
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
|
|
| 592 |
|
| 593 |
elif 'madlad' in model_name.lower():
|
| 594 |
translated_text = Translators(model_name, sl, tl, input_text).madlad()
|
| 595 |
+
|
| 596 |
+
elif 'mt-ct2-int8' in model_name.lower():
|
| 597 |
+
translated_text = Translators(model_name, sl, tl, input_text).madladct2()
|
| 598 |
|
| 599 |
elif 'mt0' in model_name.lower():
|
| 600 |
translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
|