Spaces:
Sleeping
Sleeping
| import re | |
| import unicodedata | |
def preprocess_text(text):
    """Clean raw text by removing URLs, HTML-like tags and stray symbols.

    The steps run in this order:

    1. Apply Unicode NFKC normalization.
    2. Delete URL-like tokens (anything starting with ``http`` or ``www.``
       up to the next whitespace).
    3. Delete ``<...>`` tags, including tags that span several lines.
    4. Replace every character that is not a word character, one of
       ``? ! , .`` or a plain space with a space.
    5. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The input string to clean.

    Returns:
        The cleaned, single-line string.
    """
    text = unicodedata.normalize("NFKC", text)

    # URLs are removed before tags, same as the original ordering.
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # ``[^>]*`` (unlike ``.*?`` without re.DOTALL) also crosses newlines, so
    # a tag such as "<br\n/>" is removed whole instead of leaking its name
    # ("br") into the output once the symbol filter below blanks "<" and ">".
    text = re.sub(r"<[^>]*>", "", text)

    # Keep word characters, basic punctuation and spaces; everything else
    # (including newlines and tabs) becomes a space.
    text = re.sub(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]", " ", text)

    # Squeeze whitespace runs and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text