Update app.py
app.py CHANGED

@@ -13,15 +13,15 @@ df = pl.read_parquet("isolanguages.parquet")
 non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
 # all_langs = languagecodes.iso_languages_byname
 all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos}  # {'Romanian': ('ro', 'rum', 'ron')}
+# langs = list(favourite_langs.keys())
+# langs.extend(list(all_langs.keys()))  # Language options as list, add favourite languages first
 # iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()}  # {'ro': 'Romanian', 'de': 'German'}
 iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos}  # {'ro': 'Romanian', 'de': 'German'}
 langs = {iso[0]: iso[1] for iso in non_empty_isos}  # {'Romanian': 'ro', 'German': 'de'}
-# langs = {"English": "en", "Romanian": "ro", "German": "de", "French": "fr", "Spanish": "es", "Italian": "it"}
-# langs = list(favourite_langs.keys())
-# langs.extend(list(all_langs.keys()))  # Language options as list, add favourite languages first
 
-
-
+
+models = ["Helsinki-NLP", "Argos", "Google", "t5-base", "t5-small", "t5-large", "Unbabel/Tower-Plus-2B",
+          "Unbabel/TowerInstruct-Mistral-7B-v0.2", "winninghealth/WiNGPT-Babel-2"]
 allmodels = ["Helsinki-NLP",
              "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
              "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",

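The three lookup tables above are built straight from the parquet rows. A minimal sketch of that mapping, assuming each row is laid out as (name, ISO 639-1, ISO 639-2/B, ISO 639-2/T) as the inline comments suggest:

```python
# Illustrative only: two hand-written rows standing in for non_empty_isos.
# The column order (name, iso639-1, iso639-2/B, iso639-2/T) is assumed from the comments above.
rows = [
    ("Romanian", "ro", "rum", "ron"),
    ("German", "de", "ger", "deu"),
]

all_langs = {name: (iso1, iso2b, iso2t) for name, iso1, iso2b, iso2t in rows}
iso1_to_name = {iso1: name for name, iso1, _, _ in rows}  # {'ro': 'Romanian', 'de': 'German'}
langs = {name: iso1 for name, iso1, _, _ in rows}         # {'Romanian': 'ro', 'German': 'de'}

assert all_langs["Romanian"] == ("ro", "rum", "ron")
assert iso1_to_name["de"] == "German"
```
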
@@ -49,9 +49,9 @@ class Translators:
 
     def google(self):
         url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
-        response =
+        response = httpx.get(url)
        return response.json()[0][0][0]
-
+
     @classmethod
     def download_argos_model(cls, from_code, to_code):
         import argostranslate.package

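The `-`/`+` pair above completes the request that was left dangling in the previous revision. A standalone sketch of the same call, assuming GCLIENT holds the base URL of a translate endpoint that accepts `sl`, `tl` and `q` query parameters and returns the nested JSON the method indexes into:

```python
import os

import httpx

def google_translate(sl: str, tl: str, text: str) -> str:
    # GCLIENT is assumed to be the endpoint base URL ending just before the query
    # parameters; it is kept out of the repo via an environment variable.
    url = os.environ["GCLIENT"] + f"sl={sl}&tl={tl}&q={text}"
    response = httpx.get(url, timeout=10)
    response.raise_for_status()
    # The response shape [[["<translation>", ...], ...], ...] is assumed from the indexing in google().
    return response.json()[0][0][0]
```
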
@@ -241,11 +241,11 @@ class Translators:
         translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
         translated_text = translator(self.input_text, max_length=512)
         return translated_text[0]['translation_text']
-
+
     def wingpt(self):
         model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
-
+            dtype="auto",
             device_map="auto"
         )
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)

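The new keyword follows recent transformers releases, where `dtype=` supersedes the older `torch_dtype=` argument of `from_pretrained`. A minimal loading sketch with the same two options (model id taken from the `models` list; `device_map="auto"` requires accelerate):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "winninghealth/WiNGPT-Babel-2"  # one of the ids in the `models` list above

# dtype="auto" keeps the checkpoint's native precision; device_map="auto"
# lets accelerate place the weights on the available devices.
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
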
@@ -377,40 +377,6 @@ def download_argos_model(from_code, to_code):
     )
     argostranslate.package.install_from_path(package_to_install.download())
 
-def wingpt(model_name, sl, tl, input_text):
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype="auto",
-        device_map="auto"
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    input_json = '{"input_text": input_text}'
-    messages = [
-        {"role": "system", "content": f"Translate this to {tl} language"},
-        {"role": "user", "content": input_text}
-    ]
-
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-    generated_ids = model.generate(
-        **model_inputs,
-        max_new_tokens=512,
-        temperature=0.1
-    )
-
-    generated_ids = [
-        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-    ]
-    print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
-    rawresult = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    result = rawresult.split('\n')[-1].strip() if '\n' in rawresult else rawresult.strip()
-    return result
-
 def translate_text(model_name: str, s_language: str, t_language: str, input_text: str) -> tuple[str, str]:
     """
     Translates the input text from the source language to the target language using a specified model.

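The deleted module-level `wingpt` duplicated the `Translators.wingpt` method, so only the class version remains. One detail worth keeping in mind from it: `model.generate` returns prompt plus completion token ids, so the completion is recovered by slicing off `len(input_ids)` per row. A toy illustration of that slicing, independent of any model:

```python
# Stand-ins for model_inputs.input_ids and the output of model.generate(...).
prompt_ids = [[101, 102, 103]]
full_outputs = [[101, 102, 103, 900, 901, 902]]

# Same comprehension as in wingpt(): keep only the newly generated ids.
completions = [output[len(prompt):] for prompt, output in zip(prompt_ids, full_outputs)]
assert completions == [[900, 901, 902]]
```
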
@@ -565,7 +531,7 @@ with st.container(border=None, width="stretch", height="content", horizontal=Fal
 # Handle the submit button click
 if submit_button:
     with st.spinner("Translating...", show_time=True):
-        translated_text, message = translate_text(model_name, sselected_language,
+        translated_text, message = translate_text(model_name, sselected_language, tselected_language, input_text)
         # if model_name.startswith('Helsinki-NLP'):
         # # input_ids = tokenizer.encode(input_text, return_tensors='pt')
         # # # Perform translation

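The completed call unpacks the `(translated_text, message)` pair that `translate_text` returns. A stripped-down sketch of the submit flow around it, with a stub dispatcher standing in for the real `translate_text` in app.py:

```python
import streamlit as st

def translate_text(model_name, s_language, t_language, input_text):
    # Stub with the same return shape as the real dispatcher: (translation, status message).
    return f"[{t_language}] {input_text}", f"Translated with {model_name}"

model_name = "Google"
sselected_language, tselected_language = "English", "German"
input_text = st.text_area("Text to translate", "Hello")
submit_button = st.button("Translate")

if submit_button:
    with st.spinner("Translating...", show_time=True):
        translated_text, message = translate_text(
            model_name, sselected_language, tselected_language, input_text
        )
    st.write(translated_text)
    st.caption(message)
```
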
@@ -586,11 +552,6 @@ if submit_button:
         # translation = pipe(input_text)
         # translated_text = translation[0]['translation_text']
 
-        # elif model_name.startswith('Google'):
-        #     url = os.environ['GCLIENT'] + f'sl={sl}&tl={tl}&q={input_text}'
-        #     response = httpx.get(url)
-        #     translated_text = response.json()[0][0][0]
-        #     print(response.json()[0][0])
         # elif model_name.startswith('t5'):
         #     tokenizer = T5Tokenizer.from_pretrained(model_name)
         #     model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

@@ -622,8 +583,6 @@ if submit_button:
         # translated_text = f"No Argos model for {sselected_language} to {tselected_language}. Try other model or languages combination!"
         # except Exception as error:
         # translated_text = error
-        # elif model_name == "winninghealth/WiNGPT-Babel-2":
-        #     translated_text = wingpt(model_name, sselected_language, tselected_language, input_text)
 
     # Display the translated text
     print(f"Translated from {sselected_language} to {tselected_language} using {model_name}.", input_text, translated_text)