Spaces: Runtime error

Takateru Yamakoshi committed
Commit b5a5fbe · 1 Parent(s): 8004d5f
add tokenizers
app.py CHANGED
@@ -6,19 +6,17 @@ import io
 import time
 
 @st.cache(show_spinner=True,allow_output_mutation=True)
-def load_model(model_name):
-
-
-
-
-
-
-
-
-
-    from transformers import AlbertTokenizer
-    tokenizer = AlbertTokenizer.from_pretrained(model_name)
+def load_model(tokenizer_name):
+    from transformers import AutoTokenizer
+    model_name_dict = {
+        "BERT":"bert-base-uncased",
+        "RoBERTa":"roberta-base",
+        "ALBERT":"albert-base-v2",
+        "GPT2":"gpt2",
+        "Llama":"meta-llama/Llama-2-7b-chat-hf",
+        "Gemma":"google/gemma-7b",
+    }
+    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
     return tokenizer
 
 def generate_markdown(text,color='black',font='Arial',size=20):
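The point of the hunk above is that AutoTokenizer picks the right tokenizer class from each checkpoint's config, so the per-model branching the old load_model did (AlbertTokenizer and friends) is no longer needed. A minimal standalone sketch of that behavior, not part of the commit, using the same public checkpoints as the new dictionary:

from transformers import AutoTokenizer

# AutoTokenizer reads each checkpoint's config and returns the matching
# tokenizer class, so one code path replaces the old if/elif chain.
for name in ["bert-base-uncased", "roberta-base", "albert-base-v2", "gpt2"]:
    tok = AutoTokenizer.from_pretrained(name)
    print(name, "->", type(tok).__name__)
# bert-base-uncased -> BertTokenizerFast ... gpt2 -> GPT2TokenizerFast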
@@ -26,10 +24,11 @@ def generate_markdown(text,color='black',font='Arial',size=20):
 
 def TokenizeText(sentence,tokenizer_name):
     if len(sentence)>0:
-        if tokenizer_name.startswith('gpt2'):
-            input_sent = tokenizer(sentence)['input_ids']
-        else:
-            input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        #if tokenizer_name.startswith('gpt2'):
+        #    input_sent = tokenizer(sentence)['input_ids']
+        #else:
+        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        input_sent = tokenizer(sentence)['input_ids']
         encoded_sent = [str(token) for token in input_sent]
         decoded_sent = [tokenizer.decode([token]) for token in input_sent]
         num_tokens = len(decoded_sent)
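The branch that was commented out existed because BERT-style tokenizers wrap a sentence in special tokens ([CLS]/[SEP]) while GPT-2 adds none, so the old code sliced them off with [1:-1]; the new code simply keeps them. If stripping them is ever wanted again, add_special_tokens=False does it uniformly for every model. A small sketch, not part of the commit:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tok("hello world")["input_ids"])
# [101, 7592, 2088, 102], where 101/102 are [CLS]/[SEP]
print(tok("hello world", add_special_tokens=False)["input_ids"])
# [7592, 2088], the same effect as the old [1:-1] slice, for any tokenizer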
@@ -100,10 +99,8 @@ if __name__=='__main__':
     # Select and load the tokenizer
     st.sidebar.write('1. Choose the tokenizer from below')
     tokenizer_name = st.sidebar.selectbox('',
-                                          (
-
-                                          'roberta-base','roberta-large',
-                                          'albert-base-v2','albert-xxlarge-v2'),index=7)
+                                          ("BERT","RoBERTa","ALBERT",
+                                           "GPT2","Llama","Gemma"))
     tokenizer = load_model(tokenizer_name)
 
     st.sidebar.write('2. Optional settings')
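One operational note on the new choices: meta-llama/Llama-2-7b-chat-hf and google/gemma-7b are gated repositories on the Hugging Face Hub, so from_pretrained fails for them without an accepted license and an access token, which in a Space surfaces as a runtime error. A hedged sketch of the usual fix; the HF_TOKEN secret name is an assumption, not something in this commit:

import os
from transformers import AutoTokenizer

# Gated repos need an access token; read it from a Space secret / env var.
# HF_TOKEN is an assumed name, not part of this app.
tok = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    token=os.environ["HF_TOKEN"],
)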
@@ -135,10 +132,11 @@ if __name__=='__main__':
 
     else:
         if detokenize:
-            if tokenizer_name.startswith('gpt2'):
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
-            else:
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            #if tokenizer_name.startswith('gpt2'):
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            #else:
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
             sentence = st.text_input(f'Tokenized IDs',value=' '.join([str(token) for token in default_tokens]))
             num_tokens = DeTokenizeText(sentence)
         else:
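For reference, the detokenize path pre-fills the text box with the ids as a space-separated string and DeTokenizeText decodes them back to text. The round trip, sketched standalone with the GPT-2 tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok("Tokenizers decompose bigger words into smaller tokens")["input_ids"]
id_string = " ".join(str(i) for i in ids)   # what the text box is pre-filled with
print(tok.decode([int(i) for i in id_string.split()]))
# Tokenizers decompose bigger words into smaller tokens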