import re
import unicodedata

# URLs: an explicit "http://" / "https://" scheme or a "www." prefix at a word
# boundary. The match stops at whitespace and at "<" / ">" so that a URL inside
# markup (e.g. an href attribute) cannot swallow the rest of the tag, the
# closing tag, or the text between them.
_URL_RE = re.compile(r"\b(?:https?://|www\.)[^\s<>]+", re.IGNORECASE)

# HTML/XML-like tags. DOTALL lets a tag whose attributes wrap across lines
# still be matched as a single tag.
_TAG_RE = re.compile(r"<.*?>", re.DOTALL)

# Anything that is not a word character, one of "? ! , ." or a plain space.
# NOTE: for str patterns ``\w`` is already Unicode-aware, so the explicit
# Latin-1 ranges are redundant; they are kept unchanged from the original.
_DISALLOWED_RE = re.compile(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]")

_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text: str) -> str:
    """Return a cleaned, single-line version of *text*.

    Steps, in order:

    1. Apply Unicode NFKC normalization.
    2. Remove URLs (``http://``, ``https://`` or ``www.`` prefixed).
    3. Remove ``<...>`` tags, including tags spanning several lines.
    4. Replace every character other than word characters, ``?``, ``!``,
       ``,``, ``.`` and space with a space.
    5. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.

    Raises:
        TypeError: If *text* is not a ``str`` (raised by
            ``unicodedata.normalize``).
    """
    text = unicodedata.normalize("NFKC", text)
    # URLs go first, but the pattern never crosses "<" or ">", so the
    # surrounding markup stays intact for the tag pass that follows.
    text = _URL_RE.sub("", text)
    text = _TAG_RE.sub("", text)
    text = _DISALLOWED_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()