| | --- |
| | license: cc-by-4.0 |
| | datasets: |
| | - AnnaWegmann/Paraphrase-In-Interviews |
| | language: |
| | - en |
| | base_model: |
| | - microsoft/deberta-v3-large |
| | --- |
| | |
| | This model was created as described in https://arxiv.org/abs/2404.06670; it is the best `DeBERTa ALL` model. See also the [GitHub](https://github.com/nlpsoc/Paraphrases-in-News-Interviews) repository. |
| |
|
| | ```python |
| | from transformers import AutoTokenizer, AutoModelForTokenClassification |
| | import torch |
| | |
class ParaphraseHighlighter:
    """Print which words of two texts a token classifier scores as LABEL_1.

    The two texts are encoded together as one sequence pair. Each word gets
    the LABEL_1 probability of its first sub-token; words at or above the
    threshold are shown in upper case, and the pair is reported as a
    paraphrase only when both texts contain at least one such word.
    """

    def __init__(self, model_name="AnnaWegmann/Highlight-Paraphrases-in-Dialog-ALL"):
        """Load the tokenizer and token-classification model for ``model_name``."""
        # A fast tokenizer is requested because highlight_paraphrase() relies on
        # the word_ids()/sequence_ids() helpers of the returned encoding.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # Get the label id for 'LABEL_1'
        self.label2id = self.model.config.label2id
        self.label_id = self.label2id['LABEL_1']

    def highlight_paraphrase(self, text1, text2, threshold=0.5):
        """Print a paraphrase verdict and the per-word LABEL_1 probabilities.

        Args:
            text1: First text; its words are printed under "Speaker 1:".
            text2: Second text; its words are printed under "Speaker 2:".
            threshold: Minimum LABEL_1 probability for a word to be upper-cased
                and to count towards the verdict. Defaults to 0.5.

        Returns:
            None. All results are written to stdout.
        """
        # Encode both texts as a single sequence pair; the tokenizer places the
        # special tokens (e.g. [SEP]) automatically.
        encoding = self.tokenizer(text1, text2, return_tensors="pt", padding=True, truncation=True)

        # Inference only, so skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**encoding)
        logits = outputs.logits  # Shape: (batch_size, sequence_length, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)  # Same shape as logits

        # Convert token IDs back to tokens
        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
        # Word IDs map tokens to words; None marks special tokens
        word_ids = encoding.word_ids(batch_index=0)
        # Sequence IDs tell which text (0 or 1) a token belongs to
        sequence_ids = encoding.sequence_ids(batch_index=0)

        # Per-text accumulators, keyed by sequence id
        words = {0: [], 1: []}
        word_probs = {0: [], 1: []}
        has_high_prob = {0: False, 1: False}

        # Word ids restart at 0 for the second text, so a word is identified by
        # (sequence id, word id). Comparing word ids alone would drop the first
        # word of text2 whenever text1 consists of a single word.
        previous_key = None

        for idx, (word_idx, seq_id) in enumerate(zip(word_ids, sequence_ids)):
            if word_idx is None:
                # Skip special tokens like [CLS], [SEP], [PAD]
                continue

            key = (seq_id, word_idx)
            if key == previous_key:
                # Continuation sub-token of a word that was already handled
                continue
            previous_key = key

            if seq_id not in words:
                # Should not happen for a two-text encoding
                continue

            # The word's score is the LABEL_1 probability of its first sub-token
            prob_label_1 = probs[0][idx][self.label_id].item()

            # Gather every sub-token of this word so it can be reconstructed
            end = idx + 1
            while (
                end < len(word_ids)
                and word_ids[end] == word_idx
                and sequence_ids[end] == seq_id
            ):
                end += 1
            word = self.tokenizer.convert_tokens_to_string(tokens[idx:end]).strip()

            # Upper-case words at or above the threshold
            if prob_label_1 >= threshold:
                word = word.upper()
                has_high_prob[seq_id] = True

            words[seq_id].append(word)
            word_probs[seq_id].append(prob_label_1)

        # A paraphrase needs at least one highlighted word in each text
        if has_high_prob[0] and has_high_prob[1]:
            print("is a paraphrase")
        else:
            print("is not a paraphrase")

        # Print text1's words and probabilities aligned
        print("\nSpeaker 1:")
        self._print_aligned(words[0], word_probs[0])

        # Print text2's words and probabilities aligned
        print("Speaker 2:")
        self._print_aligned(words[1], word_probs[1])

    @staticmethod
    def _print_aligned(words, probs):
        """Print ``words`` and, on the next line, ``probs`` in matching columns."""
        # All columns share the width of the longest word; default=0 keeps an
        # empty word list (e.g. an empty input text) from raising.
        column_width = max((len(word) for word in words), default=0)
        for word in words:
            print(word.ljust(column_width), end=' ')
        print()
        # Probabilities go below the words, rounded to two decimals
        for prob in probs:
            print(f"{prob:.2f}".ljust(column_width), end=' ')
        print('\n')
| | |
# Example usage: two consecutive turns from an interview
text1 = "And it will be my 20th time in doing it as a television commentator from Rome so."
text2 = "Yes, you've been doing this for a while now."

# Loads the default checkpoint, then prints the verdict and per-word scores
highlighter = ParaphraseHighlighter()
highlighter.highlight_paraphrase(text1, text2)
| | ``` |
| |
|
| | Running this example should print: |
| |
|
| | ``` |
| | is a paraphrase |
| | |
| | Speaker 1: |
| | And it will be my 20TH TIME IN DOING IT as a television commentator from Rome so. |
| | 0.06 0.38 0.35 0.37 0.45 0.60 0.51 0.51 0.51 0.59 0.38 0.37 0.42 0.38 0.24 0.26 0.14 |
| | |
| | Speaker 2: |
| | Yes, YOU'VE BEEN DOING THIS FOR A WHILE now. |
| | 0.07 0.60 0.65 0.63 0.68 0.62 0.60 0.64 0.48 |
| | ``` |
| |
|
| | For comments or questions, reach out to Anna (a.m.wegmann @ uu.nl) or raise an issue on GitHub. |
| |
|
| | If you find this model helpful, consider citing our paper: |
| |
|
| | ``` |
| | @inproceedings{wegmann-etal-2024-whats, |
| | title = "What{'}s Mine becomes Yours: Defining, Annotating and Detecting Context-Dependent Paraphrases in News Interview Dialogs", |
| | author = "Wegmann, Anna and |
| | Broek, Tijs A. Van Den and |
| | Nguyen, Dong", |
| | editor = "Al-Onaizan, Yaser and |
| | Bansal, Mohit and |
| | Chen, Yun-Nung", |
| | booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing", |
| | month = nov, |
| | year = "2024", |
| | address = "Miami, Florida, USA", |
| | publisher = "Association for Computational Linguistics", |
| | url = "https://aclanthology.org/2024.emnlp-main.52", |
| | pages = "882--912", |
| | abstract = "Best practices for high conflict conversations like counseling or customer support almost always include recommendations to paraphrase the previous speaker. Although paraphrase classification has received widespread attention in NLP, paraphrases are usually considered independent from context, and common models and datasets are not applicable to dialog settings. In this work, we investigate paraphrases across turns in dialog (e.g., Speaker 1: {``}That book is mine.{''} becomes Speaker 2: {``}That book is yours.{''}). We provide an operationalization of context-dependent paraphrases, and develop a training for crowd-workers to classify paraphrases in dialog. We introduce ContextDeP, a dataset with utterance pairs from NPR and CNN news interviews annotated for context-dependent paraphrases. To enable analyses on label variation, the dataset contains 5,581 annotations on 600 utterance pairs. We present promising results with in-context learning and with token classification models for automatic paraphrase detection in dialog.", |
| | } |
| | ``` |