Boris Malashenko committed on
Upload tokenizer
Browse files
- tokenizer.json +36 -55
- tokenizer_config.json +0 -48
tokenizer.json
CHANGED
|
@@ -300,60 +300,6 @@
|
|
| 300 |
"normalized": false,
|
| 301 |
"special": true
|
| 302 |
},
|
| 303 |
-
{
|
| 304 |
-
"id": 50285,
|
| 305 |
-
"content": "[unused0]",
|
| 306 |
-
"single_word": false,
|
| 307 |
-
"lstrip": false,
|
| 308 |
-
"rstrip": false,
|
| 309 |
-
"normalized": true,
|
| 310 |
-
"special": false
|
| 311 |
-
},
|
| 312 |
-
{
|
| 313 |
-
"id": 50286,
|
| 314 |
-
"content": "[unused1]",
|
| 315 |
-
"single_word": false,
|
| 316 |
-
"lstrip": false,
|
| 317 |
-
"rstrip": false,
|
| 318 |
-
"normalized": true,
|
| 319 |
-
"special": false
|
| 320 |
-
},
|
| 321 |
-
{
|
| 322 |
-
"id": 50287,
|
| 323 |
-
"content": "[unused2]",
|
| 324 |
-
"single_word": false,
|
| 325 |
-
"lstrip": false,
|
| 326 |
-
"rstrip": false,
|
| 327 |
-
"normalized": true,
|
| 328 |
-
"special": false
|
| 329 |
-
},
|
| 330 |
-
{
|
| 331 |
-
"id": 50288,
|
| 332 |
-
"content": "[unused3]",
|
| 333 |
-
"single_word": false,
|
| 334 |
-
"lstrip": false,
|
| 335 |
-
"rstrip": false,
|
| 336 |
-
"normalized": true,
|
| 337 |
-
"special": false
|
| 338 |
-
},
|
| 339 |
-
{
|
| 340 |
-
"id": 50289,
|
| 341 |
-
"content": "[unused4]",
|
| 342 |
-
"single_word": false,
|
| 343 |
-
"lstrip": false,
|
| 344 |
-
"rstrip": false,
|
| 345 |
-
"normalized": true,
|
| 346 |
-
"special": false
|
| 347 |
-
},
|
| 348 |
-
{
|
| 349 |
-
"id": 50290,
|
| 350 |
-
"content": "[unused5]",
|
| 351 |
-
"single_word": false,
|
| 352 |
-
"lstrip": false,
|
| 353 |
-
"rstrip": false,
|
| 354 |
-
"normalized": true,
|
| 355 |
-
"special": false
|
| 356 |
-
},
|
| 357 |
{
|
| 358 |
"id": 50291,
|
| 359 |
"content": "[unused6]",
|
|
@@ -51454,7 +51400,18 @@
|
|
| 51454 |
" ": 50276,
|
| 51455 |
" ": 50277,
|
| 51456 |
" ": 50278,
|
| 51457 |
-
" ": 50279
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51458 |
},
|
| 51459 |
"merges": [
|
| 51460 |
[
|
|
@@ -251440,6 +251397,30 @@
|
|
| 251440 |
[
|
| 251441 |
"a",
|
| 251442 |
"que"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251443 |
]
|
| 251444 |
]
|
| 251445 |
}
|
|
|
|
| 300 |
"normalized": false,
|
| 301 |
"special": true
|
| 302 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
{
|
| 304 |
"id": 50291,
|
| 305 |
"content": "[unused6]",
|
|
|
|
| 51400 |
" ": 50276,
|
| 51401 |
" ": 50277,
|
| 51402 |
" ": 50278,
|
| 51403 |
+
" ": 50279,
|
| 51404 |
+
"[UNK]": 50280,
|
| 51405 |
+
"[CLS]": 50281,
|
| 51406 |
+
"[SEP]": 50282,
|
| 51407 |
+
"[PAD]": 50283,
|
| 51408 |
+
"[MASK]": 50284,
|
| 51409 |
+
"а": 50285,
|
| 51410 |
+
"е": 50286,
|
| 51411 |
+
"и": 50287,
|
| 51412 |
+
"н": 50288,
|
| 51413 |
+
"о": 50289,
|
| 51414 |
+
"ÑĤ": 50290
|
| 51415 |
},
|
| 51416 |
"merges": [
|
| 51417 |
[
|
|
|
|
| 251397 |
[
|
| 251398 |
"a",
|
| 251399 |
"que"
|
| 251400 |
+
],
|
| 251401 |
+
[
|
| 251402 |
+
"Ð",
|
| 251403 |
+
"°"
|
| 251404 |
+
],
|
| 251405 |
+
[
|
| 251406 |
+
"Ð",
|
| 251407 |
+
"µ"
|
| 251408 |
+
],
|
| 251409 |
+
[
|
| 251410 |
+
"Ð",
|
| 251411 |
+
"¸"
|
| 251412 |
+
],
|
| 251413 |
+
[
|
| 251414 |
+
"Ð",
|
| 251415 |
+
"½"
|
| 251416 |
+
],
|
| 251417 |
+
[
|
| 251418 |
+
"Ð",
|
| 251419 |
+
"¾"
|
| 251420 |
+
],
|
| 251421 |
+
[
|
| 251422 |
+
"Ñ",
|
| 251423 |
+
"Ĥ"
|
| 251424 |
]
|
| 251425 |
]
|
| 251426 |
}
|
tokenizer_config.json
CHANGED
|
@@ -264,54 +264,6 @@
|
|
| 264 |
"single_word": false,
|
| 265 |
"special": true
|
| 266 |
},
|
| 267 |
-
"50285": {
|
| 268 |
-
"content": "[unused0]",
|
| 269 |
-
"lstrip": false,
|
| 270 |
-
"normalized": true,
|
| 271 |
-
"rstrip": false,
|
| 272 |
-
"single_word": false,
|
| 273 |
-
"special": false
|
| 274 |
-
},
|
| 275 |
-
"50286": {
|
| 276 |
-
"content": "[unused1]",
|
| 277 |
-
"lstrip": false,
|
| 278 |
-
"normalized": true,
|
| 279 |
-
"rstrip": false,
|
| 280 |
-
"single_word": false,
|
| 281 |
-
"special": false
|
| 282 |
-
},
|
| 283 |
-
"50287": {
|
| 284 |
-
"content": "[unused2]",
|
| 285 |
-
"lstrip": false,
|
| 286 |
-
"normalized": true,
|
| 287 |
-
"rstrip": false,
|
| 288 |
-
"single_word": false,
|
| 289 |
-
"special": false
|
| 290 |
-
},
|
| 291 |
-
"50288": {
|
| 292 |
-
"content": "[unused3]",
|
| 293 |
-
"lstrip": false,
|
| 294 |
-
"normalized": true,
|
| 295 |
-
"rstrip": false,
|
| 296 |
-
"single_word": false,
|
| 297 |
-
"special": false
|
| 298 |
-
},
|
| 299 |
-
"50289": {
|
| 300 |
-
"content": "[unused4]",
|
| 301 |
-
"lstrip": false,
|
| 302 |
-
"normalized": true,
|
| 303 |
-
"rstrip": false,
|
| 304 |
-
"single_word": false,
|
| 305 |
-
"special": false
|
| 306 |
-
},
|
| 307 |
-
"50290": {
|
| 308 |
-
"content": "[unused5]",
|
| 309 |
-
"lstrip": false,
|
| 310 |
-
"normalized": true,
|
| 311 |
-
"rstrip": false,
|
| 312 |
-
"single_word": false,
|
| 313 |
-
"special": false
|
| 314 |
-
},
|
| 315 |
"50291": {
|
| 316 |
"content": "[unused6]",
|
| 317 |
"lstrip": false,
|
|
|
|
| 264 |
"single_word": false,
|
| 265 |
"special": true
|
| 266 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
"50291": {
|
| 268 |
"content": "[unused6]",
|
| 269 |
"lstrip": false,
|