TiberiuCristianLeon committed on
Commit
cd52fb2
·
verified ·
1 Parent(s): 7270757

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -51
app.py CHANGED
@@ -13,15 +13,15 @@ df = pl.read_parquet("isolanguages.parquet")
13
  non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
14
  # all_langs = languagecodes.iso_languages_byname
15
  all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
 
 
16
  # iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
17
  iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
18
  langs = {iso[0]: iso[1] for iso in non_empty_isos} # {'Romanian': 'ro', 'German': 'de'}
19
- # langs = {"English": "en", "Romanian": "ro", "German": "de", "French": "fr", "Spanish": "es", "Italian": "it"}
20
- # langs = list(favourite_langs.keys())
21
- # langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
22
 
23
- models = ["Helsinki-NLP", "Argos", "t5-base", "t5-small", "t5-large", "Unbabel/Tower-Plus-2B",
24
- "Unbabel/TowerInstruct-Mistral-7B-v0.2", "winninghealth/WiNGPT-Babel-2", "Google"]
 
25
  allmodels = ["Helsinki-NLP",
26
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
27
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
@@ -49,9 +49,9 @@ class Translators:
49
 
50
  def google(self):
51
  url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
52
- response = requests.get(url)
53
  return response.json()[0][0][0]
54
-
55
  @classmethod
56
  def download_argos_model(cls, from_code, to_code):
57
  import argostranslate.package
@@ -241,11 +241,11 @@ class Translators:
241
  translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
242
  translated_text = translator(self.input_text, max_length=512)
243
  return translated_text[0]['translation_text']
244
-
245
  def wingpt(self):
246
  model = AutoModelForCausalLM.from_pretrained(
247
  self.model_name,
248
- torch_dtype="auto",
249
  device_map="auto"
250
  )
251
  tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@@ -377,40 +377,6 @@ def download_argos_model(from_code, to_code):
377
  )
378
  argostranslate.package.install_from_path(package_to_install.download())
379
 
380
- def wingpt(model_name, sl, tl, input_text):
381
- model = AutoModelForCausalLM.from_pretrained(
382
- model_name,
383
- torch_dtype="auto",
384
- device_map="auto"
385
- )
386
- tokenizer = AutoTokenizer.from_pretrained(model_name)
387
- input_json = '{"input_text": input_text}'
388
- messages = [
389
- {"role": "system", "content": f"Translate this to {tl} language"},
390
- {"role": "user", "content": input_text}
391
- ]
392
-
393
- text = tokenizer.apply_chat_template(
394
- messages,
395
- tokenize=False,
396
- add_generation_prompt=True
397
- )
398
- model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
399
-
400
- generated_ids = model.generate(
401
- **model_inputs,
402
- max_new_tokens=512,
403
- temperature=0.1
404
- )
405
-
406
- generated_ids = [
407
- output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
408
- ]
409
- print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
410
- rawresult = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
411
- result = rawresult.split('\n')[-1].strip() if '\n' in rawresult else rawresult.strip()
412
- return result
413
-
414
  def translate_text(model_name: str, s_language: str, t_language: str, input_text: str) -> tuple[str, str]:
415
  """
416
  Translates the input text from the source language to the target language using a specified model.
@@ -565,7 +531,7 @@ with st.container(border=None, width="stretch", height="content", horizontal=Fal
565
  # Handle the submit button click
566
  if submit_button:
567
  with st.spinner("Translating...", show_time=True):
568
- translated_text, message = translate_text(model_name, sselected_language, sselected_language, input_text)
569
  # if model_name.startswith('Helsinki-NLP'):
570
  # # input_ids = tokenizer.encode(input_text, return_tensors='pt')
571
  # # # Perform translation
@@ -586,11 +552,6 @@ if submit_button:
586
  # translation = pipe(input_text)
587
  # translated_text = translation[0]['translation_text']
588
 
589
- # elif model_name.startswith('Google'):
590
- # url = os.environ['GCLIENT'] + f'sl={sl}&tl={tl}&q={input_text}'
591
- # response = httpx.get(url)
592
- # translated_text = response.json()[0][0][0]
593
- # print(response.json()[0][0])
594
  # elif model_name.startswith('t5'):
595
  # tokenizer = T5Tokenizer.from_pretrained(model_name)
596
  # model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
@@ -622,8 +583,6 @@ if submit_button:
622
  # translated_text = f"No Argos model for {sselected_language} to {tselected_language}. Try other model or languages combination!"
623
  # except Exception as error:
624
  # translated_text = error
625
- # elif model_name == "winninghealth/WiNGPT-Babel-2":
626
- # translated_text = wingpt(model_name, sselected_language, tselected_language, input_text)
627
 
628
  # Display the translated text
629
  print(f"Translated from {sselected_language} to {tselected_language} using {model_name}.", input_text, translated_text)
 
13
  non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
14
  # all_langs = languagecodes.iso_languages_byname
15
  all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
16
+ # langs = list(favourite_langs.keys())
17
+ # langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
18
  # iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
19
  iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
20
  langs = {iso[0]: iso[1] for iso in non_empty_isos} # {'Romanian': 'ro', 'German': 'de'}
 
 
 
21
 
22
+
23
+ models = ["Helsinki-NLP", "Argos", "Google", "t5-base", "t5-small", "t5-large", "Unbabel/Tower-Plus-2B",
24
+ "Unbabel/TowerInstruct-Mistral-7B-v0.2", "winninghealth/WiNGPT-Babel-2"]
25
  allmodels = ["Helsinki-NLP",
26
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
27
  "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
 
49
 
50
  def google(self):
51
  url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
52
+ response = httpx.get(url)
53
  return response.json()[0][0][0]
54
+
55
  @classmethod
56
  def download_argos_model(cls, from_code, to_code):
57
  import argostranslate.package
 
241
  translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
242
  translated_text = translator(self.input_text, max_length=512)
243
  return translated_text[0]['translation_text']
244
+
245
  def wingpt(self):
246
  model = AutoModelForCausalLM.from_pretrained(
247
  self.model_name,
248
+ dtype="auto",
249
  device_map="auto"
250
  )
251
  tokenizer = AutoTokenizer.from_pretrained(self.model_name)
 
377
  )
378
  argostranslate.package.install_from_path(package_to_install.download())
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  def translate_text(model_name: str, s_language: str, t_language: str, input_text: str) -> tuple[str, str]:
381
  """
382
  Translates the input text from the source language to the target language using a specified model.
 
531
  # Handle the submit button click
532
  if submit_button:
533
  with st.spinner("Translating...", show_time=True):
534
+ translated_text, message = translate_text(model_name, sselected_language, tselected_language, input_text)
535
  # if model_name.startswith('Helsinki-NLP'):
536
  # # input_ids = tokenizer.encode(input_text, return_tensors='pt')
537
  # # # Perform translation
 
552
  # translation = pipe(input_text)
553
  # translated_text = translation[0]['translation_text']
554
 
 
 
 
 
 
555
  # elif model_name.startswith('t5'):
556
  # tokenizer = T5Tokenizer.from_pretrained(model_name)
557
  # model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
 
583
  # translated_text = f"No Argos model for {sselected_language} to {tselected_language}. Try other model or languages combination!"
584
  # except Exception as error:
585
  # translated_text = error
 
 
586
 
587
  # Display the translated text
588
  print(f"Translated from {sselected_language} to {tselected_language} using {model_name}.", input_text, translated_text)