Update app.py
Browse files
app.py
CHANGED
|
@@ -7,19 +7,18 @@ import httpx
|
|
| 7 |
|
| 8 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 9 |
# Language options and mappings
|
| 10 |
-
|
| 11 |
-
|
| 12 |
df = pl.read_parquet("isolanguages.parquet")
|
| 13 |
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
|
| 14 |
# all_langs = languagecodes.iso_languages_byname
|
| 15 |
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
|
| 16 |
-
|
| 17 |
-
|
| 18 |
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
|
| 19 |
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
|
| 20 |
langs = {iso[0]: iso[1] for iso in non_empty_isos} # {'Romanian': 'ro', 'German': 'de'}
|
| 21 |
|
| 22 |
-
|
| 23 |
models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "t5-base", "t5-small", "t5-large",
|
| 24 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
| 25 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
|
@@ -139,9 +138,26 @@ class Translators:
|
|
| 139 |
return translated_text
|
| 140 |
|
| 141 |
def hunyuan(self):
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
def HelsinkiNLP_mulroa(self):
|
| 147 |
try:
|
|
@@ -564,9 +580,9 @@ input_text = st.text_area("Enter text to translate:", placeholder="Enter text to
|
|
| 564 |
|
| 565 |
# Initialize session state if not already set
|
| 566 |
if "sselected_language" not in st.session_state:
|
| 567 |
-
st.session_state["sselected_language"] =
|
| 568 |
if "tselected_language" not in st.session_state:
|
| 569 |
-
st.session_state["tselected_language"] =
|
| 570 |
if "model_name" not in st.session_state:
|
| 571 |
st.session_state["model_name"] = models[1]
|
| 572 |
|
|
@@ -578,8 +594,8 @@ model_name = st.selectbox("Select a model:", models,
|
|
| 578 |
scol, swapcol, tcol = st.columns([3, 1, 3])
|
| 579 |
|
| 580 |
with scol:
|
| 581 |
-
sselected_language = st.selectbox("Source language:",
|
| 582 |
-
index=
|
| 583 |
with swapcol:
|
| 584 |
if st.button("π Swap"):
|
| 585 |
st.session_state["model_name"] = model_name # Preserve model
|
|
@@ -587,8 +603,8 @@ with swapcol:
|
|
| 587 |
st.session_state["tselected_language"], st.session_state["sselected_language"]
|
| 588 |
st.rerun()
|
| 589 |
with tcol:
|
| 590 |
-
tselected_language = st.selectbox("Target language:",
|
| 591 |
-
index=
|
| 592 |
|
| 593 |
# Language codes
|
| 594 |
sl = langs[st.session_state["sselected_language"]]
|
|
|
|
| 7 |
|
| 8 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 9 |
# Language options and mappings
|
| 10 |
+
favourite_langs = {"Romanian": "ro", "German": "de", "English": "en", "-----": "-----"}
|
| 11 |
+
# langs = ["German", "Romanian", "English", "French", "Spanish", "Italian",]
|
| 12 |
df = pl.read_parquet("isolanguages.parquet")
|
| 13 |
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
|
| 14 |
# all_langs = languagecodes.iso_languages_byname
|
| 15 |
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
|
| 16 |
+
langs = list(favourite_langs.keys())
|
| 17 |
+
langs.extend(list(all_langs.keys())) # Language options as list, add favourite languages first
|
| 18 |
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()} # {'ro': 'Romanian', 'de': 'German'}
|
| 19 |
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos} # {'ro': 'Romanian', 'de': 'German'}
|
| 20 |
langs = {iso[0]: iso[1] for iso in non_empty_isos} # {'Romanian': 'ro', 'German': 'de'}
|
| 21 |
|
|
|
|
| 22 |
models = ["Helsinki-NLP", "QUICKMT", "Argos", "Google", "HPLT", "t5-base", "t5-small", "t5-large",
|
| 23 |
"utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
|
| 24 |
"Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
|
|
|
|
| 138 |
return translated_text
|
| 139 |
|
| 140 |
def hunyuan(self):
    """Translate ``self.input_text`` with the chat model named by ``self.model_name``.

    Builds a single-turn user prompt asking for a translation into
    ``self.tl``, runs it through the model's chat template, generates up
    to 512 new tokens and returns the decoded result.

    Reads:
        self.sl: source language identifier.
        self.tl: target language identifier, interpolated into the prompt.
        self.input_text: text to translate.
        self.model_name: checkpoint passed to ``from_pretrained``.

    Returns:
        str: the decoded generated tokens only, with the echoed prompt and
        special tokens removed.
    """
    # Languages for which the Chinese-language prompt is used.
    # NOTE(review): membership is tested against the *names* (dict keys).
    # If callers pass ISO codes in self.sl / self.tl this branch never
    # fires - confirm against the Translators constructor.
    ZH_CODES = {"Chinese": "zh", "Traditional Chinese": "zh-Hant", "Cantonese": "yue"}

    # Sampling options. The original passed these as dict-style entries
    # inside apply_chat_template(), which is a syntax error; they are
    # generation options and therefore go to model.generate().
    # Presumably these are the model card's recommended values - TODO confirm.
    sampling_options = {
        "top_k": 20,
        "top_p": 0.6,
        "repetition_penalty": 1.05,
        "temperature": 0.7,
    }

    if self.sl in ZH_CODES or self.tl in ZH_CODES:
        prompt = f"把下面的文本翻译成{self.tl}，不要额外解释。\n\n{self.input_text}"
    else:
        # NOTE(review): a "." is appended after the user's text here; kept
        # as in the original, but it alters the input - confirm it is intended.
        prompt = f"Translate the following segment into {self.tl}, without additional explanation.\n\n{self.input_text}."

    tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

    messages = [{"role": "user", "content": prompt}]
    tokenized_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
    )
    input_ids = tokenized_chat.to(model.device)

    # do_sample=True is required for top_k / top_p / temperature to take effect.
    outputs = model.generate(
        input_ids,
        max_new_tokens=512,
        do_sample=True,
        **sampling_options,
    )

    # generate() returns prompt + continuation; keep only the continuation
    # so the caller receives the translation rather than the echoed prompt.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return output_text
|
| 161 |
|
| 162 |
def HelsinkiNLP_mulroa(self):
|
| 163 |
try:
|
|
|
|
| 580 |
|
| 581 |
# Initialize session state if not already set
|
| 582 |
if "sselected_language" not in st.session_state:
|
| 583 |
+
st.session_state["sselected_language"] = langs[0]
|
| 584 |
if "tselected_language" not in st.session_state:
|
| 585 |
+
st.session_state["tselected_language"] = langs[1]
|
| 586 |
if "model_name" not in st.session_state:
|
| 587 |
st.session_state["model_name"] = models[1]
|
| 588 |
|
|
|
|
| 594 |
scol, swapcol, tcol = st.columns([3, 1, 3])
|
| 595 |
|
| 596 |
with scol:
|
| 597 |
+
sselected_language = st.selectbox("Source language:", langs,
|
| 598 |
+
index=langs.index(st.session_state["sselected_language"]))
|
| 599 |
with swapcol:
|
| 600 |
if st.button("π Swap"):
|
| 601 |
st.session_state["model_name"] = model_name # Preserve model
|
|
|
|
| 603 |
st.session_state["tselected_language"], st.session_state["sselected_language"]
|
| 604 |
st.rerun()
|
| 605 |
with tcol:
|
| 606 |
+
tselected_language = st.selectbox("Target language:", langs,
|
| 607 |
+
index=langs.index(st.session_state["tselected_language"]))
|
| 608 |
|
| 609 |
# Language codes
|
| 610 |
sl = langs[st.session_state["sselected_language"]]
|