SERASA_BERT_OCR / app /preprocess.py
Zevir's picture
teste
4d16182
raw
history blame contribute delete
312 Bytes
import re
import unicodedata
def preprocess_text(text) -> str:
    """Normalize and clean raw text.

    Steps, in order:

    1. Unicode NFKC normalization (folds compatibility forms such as
       ligatures and full-width characters into their canonical form).
    2. Removal of HTML-like tags (``<...>`` on a single line).
    3. Removal of URLs starting with ``http`` or ``www.``.
    4. Replacement of every character that is not a word character,
       a Latin-1 accented letter, a space, or one of ``? ! , .`` with
       a space.
    5. Collapsing of whitespace runs into one space and trimming of
       both ends.

    Args:
        text: The string to clean. ``None`` is tolerated and treated as
            empty input.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or when
        nothing survives the cleaning.
    """
    if text is None:
        return ""

    text = unicodedata.normalize("NFKC", text)

    # Tags must go before URLs: the URL pattern consumes everything up to
    # the next whitespace, so running it first on markup such as
    # '<a href="http://x">label</a>' would swallow the closing of the tag
    # together with the visible label and leave tag residue behind.
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Anything outside the allowed set becomes a space (not an empty
    # string) so that words separated only by punctuation do not merge.
    text = re.sub(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]", " ", text)

    return re.sub(r"\s+", " ", text).strip()