VuvanAn committed on
Commit 09dc9d3 · verified · 1 Parent(s): 5ba9d4c

Upload 47 files

Files changed (48)
  1. .gitattributes +3 -0
  2. README.md +20 -0
  3. __pycache__/app.cpython-313.pyc +0 -0
  4. __pycache__/utils.cpython-313.pyc +0 -0
  5. app.py +136 -0
  6. config.yaml +25 -0
  7. knowledge/vectorstore_1/config.json +1 -0
  8. knowledge/vectorstore_1/docs.pkl +3 -0
  9. knowledge/vectorstore_1/index.faiss +3 -0
  10. knowledge/vectorstore_1/index.pkl +3 -0
  11. rag_pipeline/__init__.py +8 -0
  12. rag_pipeline/__pycache__/__init__.cpython-313.pyc +0 -0
  13. rag_pipeline/data_ingest/__pycache__/loader.cpython-313.pyc +0 -0
  14. rag_pipeline/data_ingest/loader.py +40 -0
  15. rag_pipeline/data_ingest/parser.py +0 -0
  16. rag_pipeline/generation/__pycache__/llm_wrapper.cpython-313.pyc +0 -0
  17. rag_pipeline/generation/__pycache__/prompt_template.cpython-313.pyc +0 -0
  18. rag_pipeline/generation/llm_wrapper.py +59 -0
  19. rag_pipeline/generation/prompt_template.py +115 -0
  20. rag_pipeline/indexing/chunking/__pycache__/markdown.cpython-313.pyc +0 -0
  21. rag_pipeline/indexing/chunking/__pycache__/recursive.cpython-313.pyc +0 -0
  22. rag_pipeline/indexing/chunking/markdown.py +54 -0
  23. rag_pipeline/indexing/chunking/recursive.py +30 -0
  24. rag_pipeline/indexing/embedding/__pycache__/embedding.cpython-313.pyc +0 -0
  25. rag_pipeline/indexing/embedding/embedding.py +23 -0
  26. rag_pipeline/retrieval/__pycache__/reranker.cpython-313.pyc +0 -0
  27. rag_pipeline/retrieval/__pycache__/vector_retriever.cpython-313.pyc +0 -0
  28. rag_pipeline/retrieval/graph_retriever.py +4 -0
  29. rag_pipeline/retrieval/hybrid_retriever.py +0 -0
  30. rag_pipeline/retrieval/reranker.py +8 -0
  31. rag_pipeline/retrieval/vector_retriever.py +38 -0
  32. requirements.txt +0 -0
  33. test/__pycache__/_normalize_qa.cpython-313.pyc +0 -0
  34. test/__pycache__/data_ingest.cpython-313.pyc +0 -0
  35. test/__pycache__/eval_lm.cpython-313.pyc +0 -0
  36. test/__pycache__/eval_qa.cpython-313.pyc +0 -0
  37. test/__pycache__/prepare_retrieve.cpython-313.pyc +0 -0
  38. test/__pycache__/test_llm.cpython-313.pyc +0 -0
  39. test/__pycache__/test_retrieve.cpython-313.pyc +0 -0
  40. test/_normalize_qa.py +43 -0
  41. test/chatbot_inference.py +23 -0
  42. test/data_ingest.py +78 -0
  43. test/eval_lm.py +87 -0
  44. test/eval_qa.py +106 -0
  45. test/prepare_retrieve.py +50 -0
  46. test/test_llm.py +9 -0
  47. test/test_retrieve.py +39 -0
  48. utils.py +211 -0
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ knowledge/vectorstore_1/docs.pkl filter=lfs diff=lfs merge=lfs -text
+ knowledge/vectorstore_1/index.faiss filter=lfs diff=lfs merge=lfs -text
+ knowledge/vectorstore_1/index.pkl filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,20 @@
+ ```
+ python -m notebook.An.master.test.data_ingest \
+ --data_paths notebook/An/master/data \
+ --vectorstore_dir notebook/An/master/knowledge/vectorstore_1 \
+ --embed_model_name alibaba-nlp/gte-multilingual-base \
+ --chunk_method recursive \
+ --chunk_size 2048 \
+ --chunk_overlap 512 \
+ --vectorstore faiss
+ ```
+
+ ```
+ python -m notebook.An.master.test.test_retrieve \
+ --query "Heart definition and heart disease" \
+ --vectorstore_dir notebook/An/master/knowledge/vectorstore_1 \
+ --embed_model_name alibaba-nlp/gte-multilingual-base \
+ --retriever_k 4 \
+ --metric cosine \
+ --threshold 0.5
+ ```
__pycache__/app.cpython-313.pyc ADDED
Binary file (6.45 kB). View file
 
__pycache__/utils.cpython-313.pyc ADDED
Binary file (10.9 kB). View file
 
app.py ADDED
@@ -0,0 +1,136 @@
+ import gradio as gr
+ from datetime import datetime
+
+ # Assuming these are in your project structure
+ from .rag_pipeline import ChatAssistant, get_embeddings, vretrieve, retrieve_chatbot_prompt, request_retrieve_prompt
+ from .utils import load_local
+
+ # --- Constants and System Prompt ---
+
+ # DEVELOPER: Add or remove models here.
+ # The key is the display name in the dropdown.
+ # The value is a tuple of (model_id, model_provider).
+ AVAILABLE_MODELS = {
+     "mistral large (mistral)": ("mistral", "mistral"),
+     "mistral medium (mistral)": ("mistral-medium", "mistral"),
+     "mistral small (mistral)": ("mistral-small", "mistral"),
+     "llama3 8B": ("llama3:8b", "ollama"),
+     "llama3.1 8B": ("llama3.1:8b", "ollama"),
+     "gpt-oss 20B": ("gpt-oss-20b", "ollama"),
+     "gemma3 12B": ("gemma3:12b", "ollama"),
+     "gpt 4o mini": ("gpt-4o-mini", "openai"),
+     "gpt 4o": ("gpt-4o", "openai"),
+ }
+ DEFAULT_MODEL_KEY = "mistral medium (mistral)"
+
+ EMBEDDING_MODEL_ID = "alibaba-nlp/gte-multilingual-base"
+ VECTORSTORE_PATH = "notebook/An/master/knowledge/vectorstore_full"
+ LOG_FILE_PATH = "log.txt"
+ MAX_HISTORY_CONVERSATION = 50
+
+ # System prompt for the medical assistant
+ sys = """
+ You are a Medical Assistant specialized in providing information and answering questions related to healthcare and medicine.
+ You must answer professionally and empathetically, taking into account the user's feelings and concerns.
+ """
+
+ # --- Initial Setup (runs once) ---
+ print("Initializing models and data...")
+ embedding_model = get_embeddings(EMBEDDING_MODEL_ID, show_progress=False)
+ vectorstore, docs = load_local(VECTORSTORE_PATH, embedding_model)
+ print("Initialization complete.")
+
+
+ # --- Helper Functions ---
+ def log(log_txt: str):
+     """Appends a log entry to the log file."""
+     with open(LOG_FILE_PATH, "a", encoding="utf-8") as log_file:
+         log_file.write(log_txt + "\n")
+
+
+ # --- Core Chatbot Logic ---
+ def chatbot_logic(message: str, history: list, selected_model_key: str):
+     """
+     Handles the main logic for receiving a message, performing RAG, and generating a response.
+     """
+     # 1. Look up the model_id and model_provider from the selected key
+     model_id, model_provider = AVAILABLE_MODELS[selected_model_key]
+
+     log(f"** Current time **: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+     log(f"** User message **: {message}")
+     log(f"** Using Model **: {model_id} ({model_provider})")
+
+     # Initialize the assistant with the specified model for this request
+     try:
+         chat_assistant = ChatAssistant(model_id, model_provider)
+     except Exception as e:
+         yield f"Error: Could not initialize the model. Please check the ID and provider. Details: {e}"
+         return
+
+     # --- RAG Pipeline ---
+     # 2. Format conversation history for context
+     history = history[-MAX_HISTORY_CONVERSATION:]
+     conversation = "".join(f"User: {user_msg}\nBot: {bot_msg}\n" for user_msg, bot_msg in history)
+     query_for_rag = conversation + f"User: {message}\nBot:"
+
+     # 3. Generate a search query from the conversation
+     rag_query = chat_assistant.get_response(request_retrieve_prompt.format(role="user", conversation=query_for_rag))
+     rag_query = rag_query[rag_query.lower().rfind("[") + 1: rag_query.rfind("]")]
+
+     # 4. Retrieve relevant documents if necessary
+     if "NO" not in rag_query:
+         retrieve_results = vretrieve(rag_query, vectorstore, docs, k=4, metric="mmr", threshold=0.7)
+     else:
+         retrieve_results = []
+
+     retrieved_docs = "\n".join([f"Document {i+1}:\n" + doc.page_content for i, doc in enumerate(retrieve_results)])
+     log(f"** RAG query **: {rag_query}")
+     log(f"** Retrieved documents **:\n{retrieved_docs}")
+
+     # --- Final Response Generation ---
+     # 5. Create the final prompt with retrieved context
+     final_prompt = retrieve_chatbot_prompt.format(role="user", documents=retrieved_docs, conversation=query_for_rag)
+
+     # 6. Stream the response from the LLM
+     response = ""
+     for token in chat_assistant.get_streaming_response(final_prompt, sys):
+         response += token
+         yield response
+
+     log(f"** Bot response **: {response}")
+     log("=" * 50 + "\n\n")
+
+
+ # --- Gradio UI ---
+ with gr.Blocks(theme="soft") as chatbot_ui:
+     gr.Markdown("# MedLLM")
+
+     model_selector = gr.Dropdown(
+         label="Select Model",
+         choices=list(AVAILABLE_MODELS.keys()),
+         value=DEFAULT_MODEL_KEY,
+     )
+
+     chatbot = gr.Chatbot(label="Chat Window", height=500, bubble_full_width=False)
+     msg_input = gr.Textbox(label="Your Message", placeholder="Type your question here and press Enter...", scale=7)
+
+     def respond(message, chat_history, selected_model_key):
+         """Wrapper function to connect chatbot_logic with Gradio's state."""
+         bot_message_stream = chatbot_logic(message, chat_history, selected_model_key)
+         chat_history.append([message, ""])
+         for token in bot_message_stream:
+             chat_history[-1][1] = token
+             yield chat_history
+
+     msg_input.submit(
+         respond,
+         [msg_input, chatbot, model_selector],
+         [chatbot]
+     ).then(
+         lambda: gr.update(value=""), None, [msg_input], queue=False
+     )
+
+
+ # --- Launch the App ---
+ if __name__ == "__main__":
+     chatbot_ui.launch(debug=True, share=False)
config.yaml ADDED
@@ -0,0 +1,25 @@
+ version: 0.1
+
+ model:
+   name: "llama2:7b"
+   temperature: 0.3
+   max_tokens: 100000
+   provider: "ollama"
+   base_url: "http://localhost:11434/v1"
+
+ rag_config:
+   k: 4
+   rerank:
+     name: "bge-reranker-large"
+     model: "BAAI/bge-reranker-large"
+     top_n: 100
+   embed_model:
+     name: "gte-multilingual-base"
+     model: "alibaba-nlp/gte-multilingual-base"
+   chunk_size: 2048
+   chunk_overlap: 512
+   similarity_threshold: 0.7
+   similarity_metric: "cosine"
+
+ knowledge:
+   vectorstore: "faiss"
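
config.yaml collects the model, retrieval, and chunking defaults, but none of the scripts in this upload read it yet; the sketch below shows one way it could be consumed. It is a minimal example that assumes PyYAML is installed and that the nesting shown above is the intended structure.

```
import yaml

# Load the pipeline defaults from config.yaml (assumed to be in the working directory).
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

model_name = cfg["model"]["name"]                       # "llama2:7b"
retriever_k = cfg["rag_config"]["k"]                    # 4
embed_model_id = cfg["rag_config"]["embed_model"]["model"]  # "alibaba-nlp/gte-multilingual-base"
print(model_name, retriever_k, embed_model_id)
```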
knowledge/vectorstore_1/config.json ADDED
@@ -0,0 +1 @@
+ {"data_paths": ["dataset/RAG_Data/wiki_vi", "dataset/RAG_Data/youmed"], "vectorstore_dir": "notebook/An/master/knowledge/vectorstore_1", "file_type": "txt", "embed_model_name": "alibaba-nlp/gte-multilingual-base", "chunk_size": 2048, "chunk_overlap": 512, "chunk_method": "markdown", "vectorstore": "faiss", "clear_vectorstore": true}
knowledge/vectorstore_1/docs.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e969c1a0beb575363fc3cd0e252b9751f9ad79fc605ec6ab4a2c4ee68845e43
+ size 7568017
knowledge/vectorstore_1/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10ffba3c9fc6846d51de37463833eecf8b42b036a78e93e90ff779fbd47268f6
+ size 9440301
knowledge/vectorstore_1/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9d81a73aa18b621f660a69e7ce3bba1b8b1875e983752a1e504f1f2922a7fdc
+ size 7730542
rag_pipeline/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .generation.llm_wrapper import ChatAssistant
+ from .indexing.chunking.recursive import split_document as recursive_chunking
+ from .indexing.chunking.markdown import split_document as markdown_chunking
+ from .indexing.embedding.embedding import get_embeddings
+ from .data_ingest.loader import load_data
+ from .generation.prompt_template import *
+ from .retrieval.vector_retriever import retrieve as vretrieve
+ from .retrieval.reranker import rerank
rag_pipeline/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (704 Bytes). View file
 
rag_pipeline/data_ingest/__pycache__/loader.cpython-313.pyc ADDED
Binary file (2.09 kB). View file
 
rag_pipeline/data_ingest/loader.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ from typing import List
+ from langchain.schema import Document
+
+ def load_data(data_path: str, file_type: str) -> List[Document]:
+     """
+     Load knowledge data from a specified path and file type.
+     Args:
+         data_path: The path to the data.
+         file_type: The type of the data.
+     Returns:
+         A list of documents.
+     """
+     if file_type == "pdf":
+         raise NotImplementedError("PDF loading is not yet implemented.")
+     elif file_type == "txt":
+         return _load_txt(data_path)
+
+ def _load_txt(data_path: str) -> List[Document]:
+     splits = []
+
+     if not os.path.isdir(data_path):
+         raise FileNotFoundError(f"Error: Directory not found at {data_path}")
+
+     for file_name in os.listdir(data_path):
+         if file_name.endswith('.txt'):
+             file_path = os.path.join(data_path, file_name)
+
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     content = f.read()
+                 metadata = {"source": file_name}
+                 doc = Document(page_content=content, metadata=metadata)
+
+                 splits.append(doc)
+
+             except Exception as e:
+                 print(f"Error reading file {file_path}: {e}")
+
+     return splits
rag_pipeline/data_ingest/parser.py ADDED
File without changes
rag_pipeline/generation/__pycache__/llm_wrapper.cpython-313.pyc ADDED
Binary file (2.9 kB). View file
 
rag_pipeline/generation/__pycache__/prompt_template.cpython-313.pyc ADDED
Binary file (3.99 kB). View file
 
rag_pipeline/generation/llm_wrapper.py ADDED
@@ -0,0 +1,59 @@
+ from openai import OpenAI
+ import backoff
+
+ import os
+
+ _base_url_ = {
+     "ollama": "http://localhost:11434/v1",
+     "mistral": "https://api.mistral.ai/v1",
+     "openai": "https://api.openai.com/v1",
+ }
+
+ _api_key_ = {
+     "ollama": "ollama",
+     "mistral": os.getenv("MISTRAL_API_KEY"),
+     "openai": os.getenv("OPENAI_API_KEY"),
+ }
+
+ class ChatAssistant:
+     def __init__(self, model_name: str, provider: str = "ollama"):
+         """
+         Args:
+             model_name: The name of the model to use.
+             provider: The provider of the model. Can be "ollama", "mistral", or "openai".
+         """
+         self.model_name = model_name
+         self.client = OpenAI(
+             base_url=_base_url_[provider],
+             api_key=_api_key_[provider],
+         )
+
+     @backoff.on_exception(backoff.expo, Exception)
+     def get_response(self, user: str, sys: str = ""):
+         response = self.client.chat.completions.create(
+             model=self.model_name,
+             messages=[
+                 {"role": "system", "content": sys},
+                 {"role": "user", "content": user},
+             ]
+         )
+         return response.choices[0].message.content
+
+     @backoff.on_exception(backoff.expo, Exception)
+     def get_streaming_response(self, user: str, sys: str = ""):
+         """Yields the response token by token (streaming)."""
+         response_stream = self.client.chat.completions.create(
+             model=self.model_name,
+             messages=[
+                 {"role": "system", "content": sys},
+                 {"role": "user", "content": user},
+             ],
+             stream=True
+         )
+
+         # Iterate over the stream of chunks
+         for chunk in response_stream:
+             # The actual token is in chunk.choices[0].delta.content
+             token = chunk.choices[0].delta.content
+             if token is not None:
+                 yield token
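
ChatAssistant is a thin wrapper around an OpenAI-compatible chat-completions endpoint. A minimal usage sketch follows; it assumes MISTRAL_API_KEY is exported for the "mistral" provider (or a local Ollama server for "ollama"), and that the package is importable as rag_pipeline, as in test/chatbot_inference.py.

```
from rag_pipeline import ChatAssistant

assistant = ChatAssistant("mistral-medium", provider="mistral")

# Blocking call: returns the full completion as a single string.
print(assistant.get_response("What is hypertension?", sys="You are a medical assistant."))

# Streaming call: tokens are yielded as they arrive.
for token in assistant.get_streaming_response("What is hypertension?"):
    print(token, end="", flush=True)
```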
rag_pipeline/generation/prompt_template.py ADDED
@@ -0,0 +1,115 @@
+ multichoice_qa_prompt = """
+ -- DOCUMENT --
+ {document}
+ -- END OF DOCUMENT --
+
+ -- INSTRUCTION --
+ You are a medical expert.
+ Given the documents, you must answer the question by following these steps.
+ First, you must read the question and the options, and draft an answer based on your knowledge.
+ Second, you must read the documents and check if they can help answer the question.
+ Third, you cross-check the documents with your knowledge and the draft answer.
+ Finally, you answer the question based on your knowledge and the true documents.
+ Your response must end with the letter of the most correct option, like: "the answer is A".
+ The entire thought must be under 500 words long.
+ -- END OF INSTRUCTION --
+
+ -- QUESTION --
+ {question}
+ {options}
+ -- END OF QUESTION --
+ """
+
+ qa_prompt = """
+ -- DOCUMENT --
+ {document}
+ -- END OF DOCUMENT --
+
+ -- INSTRUCTION --
+ You are a medical expert.
+ Given the documents, you must answer the question by following these steps.
+ First, you must read the question and draft an answer based on your knowledge.
+ Second, you must read the documents and check if they can help answer the question.
+ Third, you cross-check the documents with your knowledge and the draft answer.
+ Finally, you answer the question concisely based on your knowledge and the true documents.
+ Your response must be as short as possible, in Vietnamese, and between brackets like: "[...]".
+ -- END OF INSTRUCTION --
+
+ -- QUESTION --
+ {question}
+ -- END OF QUESTION --
+ """
+
+ retrieve_chatbot_prompt = """
+ You are a medical expert.
+ You are having a conversation with a {role} and you have external documents to help you.
+ Continue the conversation based on the chat history and the context information, not prior knowledge.
+ Before using a retrieved chunk, you must check whether it is relevant to the user query. If it is not relevant, you must ignore it.
+ You use the relevant chunks to answer the question and cite the source inside <<<>>>.
+ If you don't know the answer, you must say "I don't know".
+ ---------------------
+ {documents}
+ ---------------------
+ Given the documents and not prior knowledge, continue the conversation.
+ ---------------------
+ {conversation}
+ ---------------------
+ """
+
+ request_retrieve_prompt = """
+ --- INSTRUCTION ---
+ You are having a conversation with a {role}.
+ You have to provide a short query to retrieve the documents that you need, inside brackets like: "[...]".
+ If it is not related to the medical field, or you do not need external knowledge to answer, you must write "[NO]".
+ --- END OF INSTRUCTION ---
+
+ --- CONVERSATION ---
+ {conversation}
+ --- END OF CONVERSATION ---
+ """
+
+ answer_prompt = """
+ -- INSTRUCTION --
+ You are a medical expert.
+ Given the documents below, you must answer the question step by step.
+ First, you must read the question.
+ Second, you must read the documents and check their reliability.
+ Third, you cross-check with your knowledge.
+ Finally, you answer the question based on your knowledge and the true documents.
+
+ Your answer must be UNDER 50 words, written on 1 line, and in Vietnamese.
+ -- END OF INSTRUCTION --
+
+ -- QUESTION --
+ {question}
+ -- END OF QUESTION --
+
+ -- DOCUMENT --
+ {document}
+ -- END OF DOCUMENT --
+
+ """
+
+ translate_prompt = """
+ [ INSTRUCTION ]
+ You are a Medical translator expert.
+ Your task is to translate this English question into Vietnamese with EXACTLY the same format, written on 1 line.
+ [ END OF INSTRUCTION ]
+
+ [ QUERY TO TRANSLATE ]
+ {query}
+ [ END OF QUERY TO TRANSLATE ]
+ """
+
+ pdf2txt_prompt = """
+ Rewrite this plain text from a PDF file following the correct reading order and these instructions:
+ - Use markdown format.
+ - Use the same language.
+ - Keep the content intact.
+ - Beautify the table.
+ - No talk.
+
+ [ QUERY ]
+ {query}
+ [ END OF QUERY ]
+ """
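
The chatbot flow in app.py first asks the model for a bracketed retrieval query via request_retrieve_prompt and then slices the reply between the last "[" and "]". A small sketch of that round trip is below; the reply string is a placeholder standing in for a real LLM response, not actual model output.

```
from rag_pipeline import request_retrieve_prompt

conversation = "User: What are the side effects of beta blockers?\nBot:"
prompt = request_retrieve_prompt.format(role="user", conversation=conversation)

# The model is expected to answer either "[<short search query>]" or "[NO]".
reply = "[beta blocker side effects]"  # placeholder reply for illustration
rag_query = reply[reply.rfind("[") + 1 : reply.rfind("]")]
needs_retrieval = "NO" not in rag_query
print(rag_query, needs_retrieval)
```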
rag_pipeline/indexing/chunking/__pycache__/markdown.cpython-313.pyc ADDED
Binary file (2.55 kB). View file
 
rag_pipeline/indexing/chunking/__pycache__/recursive.cpython-313.pyc ADDED
Binary file (1.52 kB). View file
 
rag_pipeline/indexing/chunking/markdown.py ADDED
@@ -0,0 +1,54 @@
+ from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from typing import List
+
+ def __split_1_document__(document: Document, chunk_size: int, chunk_overlap: int) -> List[Document]:
+     headers_to_split_on = [
+         ("#", "Header 1"),
+         ("##", "Header 2"),
+         ("###", "Header 3"),
+     ]
+
+     markdown_splitter = MarkdownHeaderTextSplitter(
+         headers_to_split_on=headers_to_split_on,
+         strip_headers=False,
+         return_each_line=False
+     )
+
+     md_header_splits = markdown_splitter.split_text(document.page_content)
+
+     for doc in md_header_splits:
+         doc.metadata.update(document.metadata)
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size, chunk_overlap=chunk_overlap
+     )
+
+     final_splits = text_splitter.split_documents(md_header_splits)
+
+     # Iterate through the final chunks to prepend metadata to the page_content
+     for i, doc in enumerate(final_splits):
+         header_lines = []
+         source_line = f"-- source: {doc.metadata.get('source', 'N/A')}"
+
+         if 'Header 1' in doc.metadata:
+             header_lines.append(doc.metadata['Header 1'])
+         if 'Header 2' in doc.metadata:
+             header_lines.append(doc.metadata['Header 2'])
+         if 'Header 3' in doc.metadata:
+             header_lines.append(doc.metadata['Header 3'])
+
+         header_content = "\n".join(header_lines)
+         chunk_header = f"Chunk {i+1}:"
+
+         # Combine everything into the new page content
+         original_content = doc.page_content
+         doc.page_content = f"{source_line}\n{header_content}\n{chunk_header}\n{original_content}"
+
+     return final_splits
+
+ def split_document(documents: List[Document], chunk_size: int, chunk_overlap: int) -> List[Document]:
+     split_documents = []
+     for doc in documents:
+         split_documents.extend(__split_1_document__(doc, chunk_size, chunk_overlap))
+     return split_documents
rag_pipeline/indexing/chunking/recursive.py ADDED
@@ -0,0 +1,30 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from typing import List
+
+ def __split_1_document__(document: Document, chunk_size: int, chunk_overlap: int) -> List[Document]:
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+     )
+
+     text_content = document.page_content
+     text_chunks = text_splitter.split_text(text_content)
+     split_documents = []
+
+     for i, chunk in enumerate(text_chunks):
+         new_metadata = document.metadata.copy()
+
+         # new_metadata['chunk_number'] = i + 1
+
+         new_doc = Document(page_content=chunk, metadata=new_metadata)
+         split_documents.append(new_doc)
+
+     return split_documents
+
+
+ def split_document(documents: List[Document], chunk_size: int, chunk_overlap: int) -> List[Document]:
+     split_documents = []
+     for doc in documents:
+         split_documents.extend(__split_1_document__(doc, chunk_size, chunk_overlap))
+     return split_documents
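
Both chunkers expose the same split_document(documents, chunk_size, chunk_overlap) signature and are re-exported from rag_pipeline as recursive_chunking and markdown_chunking. A short sketch of chunking a loaded corpus follows; the data path is taken from config.json and is only illustrative.

```
from rag_pipeline import load_data, recursive_chunking

# Load plain-text documents, then split them into overlapping chunks.
docs = load_data("dataset/RAG_Data/wiki_vi", "txt")
chunks = recursive_chunking(docs, chunk_size=2048, chunk_overlap=512)
print(f"{len(docs)} documents -> {len(chunks)} chunks")

# markdown_chunking has the same call shape, but splits on '#', '##', '###' headers first
# and prepends the source and header metadata to each chunk's page_content.
```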
rag_pipeline/indexing/embedding/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (1.06 kB). View file
 
rag_pipeline/indexing/embedding/embedding.py ADDED
@@ -0,0 +1,23 @@
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ import torch
+
+ _model_cache = {}
+
+ def get_embeddings(model_name: str, show_progress: bool = True) -> HuggingFaceEmbeddings:
+     """
+     Get the embeddings model. Cache available.
+     Args:
+         model_name: The name of the model.
+     Returns:
+         The embeddings model.
+     """
+     if model_name not in _model_cache:
+         embeddings = HuggingFaceEmbeddings(
+             model_name=model_name,
+             show_progress=show_progress,
+             model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu', 'trust_remote_code': True},
+             encode_kwargs={'batch_size': 15}
+         )
+         _model_cache[model_name] = embeddings
+     return _model_cache[model_name]
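
get_embeddings caches one HuggingFaceEmbeddings instance per model name, so repeated calls reuse the already-loaded model. A quick sketch:

```
from rag_pipeline import get_embeddings

embed_model = get_embeddings("alibaba-nlp/gte-multilingual-base", show_progress=False)
vector = embed_model.embed_query("beta blockers for hypertension")
print(len(vector))  # embedding dimensionality

# A second call with the same model name returns the cached instance.
assert get_embeddings("alibaba-nlp/gte-multilingual-base") is embed_model
```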
rag_pipeline/retrieval/__pycache__/reranker.cpython-313.pyc ADDED
Binary file (504 Bytes). View file
 
rag_pipeline/retrieval/__pycache__/vector_retriever.cpython-313.pyc ADDED
Binary file (2.25 kB). View file
 
rag_pipeline/retrieval/graph_retriever.py ADDED
@@ -0,0 +1,4 @@
+ from typing import List, Any
+
+ def retrieve(query: str, graphstore: Any = None) -> List[str]:
+     pass
rag_pipeline/retrieval/hybrid_retriever.py ADDED
File without changes
rag_pipeline/retrieval/reranker.py ADDED
@@ -0,0 +1,8 @@
+ import os
+ import pickle
+ from typing import List
+
+ from langchain.schema import Document
+
+ def rerank(docs: List[Document]) -> List[Document]:
+     return docs
rag_pipeline/retrieval/vector_retriever.py ADDED
@@ -0,0 +1,38 @@
+ from langchain_community.vectorstores import FAISS
+ from langchain.schema import Document
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ from .reranker import rerank
+
+ from typing import List, Any
+
+ def retrieve(query: str, vectorstore: FAISS, docs: List[Document] = None, k: int = 4, metric: str = "cosine", threshold: float = 0.5, reranker: Any = None) -> List[Document]:
+     """
+     Retrieve documents from the vectorstore based on the query and metric.
+     Args:
+         query: The query to search for.
+         metric: The metric to use for retrieval.
+         vectorstore: The vectorstore to search in.
+         k: The number of documents to retrieve.
+         threshold: The threshold for the metric to use for retrieval.
+         reranker: The reranker to use for reranking the retrieved documents.
+     Returns:
+         A list of documents.
+     """
+     if metric == "cosine":
+         docs = vectorstore.similarity_search_with_score(query, k=k)
+         docs = [doc for doc, score in docs if score > threshold]
+     elif metric == "mmr":
+         docs = vectorstore.max_marginal_relevance_search(query, k=k)
+     elif metric == "bm25":
+         from langchain_community.retrievers import BM25Retriever
+         if docs is None:
+             raise ValueError("Documents not available. BM25 requires ingested or loaded documents.")
+         bm25_retriever = BM25Retriever.from_documents(docs)
+         docs = bm25_retriever.get_relevant_documents(query, k=k)
+     else:
+         raise ValueError(f"Unsupported metric: '{metric}'. Supported metrics are 'cosine', 'mmr', and 'bm25'.")
+
+     if reranker is not None:
+         return rerank(docs)
+     return docs
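
retrieve (re-exported as vretrieve) switches on the metric argument. A usage sketch against a locally saved store follows; the vectorstore path is illustrative, and note that FAISS's similarity_search_with_score returns raw scores whose direction depends on the index's distance strategy, so verify that the `score > threshold` filter above keeps the documents you expect.

```
from rag_pipeline import get_embeddings, vretrieve
from utils import load_local

embed_model = get_embeddings("alibaba-nlp/gte-multilingual-base", show_progress=False)
vectorstore, docs = load_local("knowledge/vectorstore_1", embed_model)  # illustrative path

# Score-thresholded similarity search.
hits = vretrieve("heart disease symptoms", vectorstore, docs, k=4, metric="cosine", threshold=0.5)

# Diversity-oriented retrieval; the threshold is not used for mmr.
hits_mmr = vretrieve("heart disease symptoms", vectorstore, docs, k=4, metric="mmr")
print(len(hits), len(hits_mmr))
```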
requirements.txt ADDED
Binary file (11.9 kB). View file
 
test/__pycache__/_normalize_qa.cpython-313.pyc ADDED
Binary file (2.2 kB). View file
 
test/__pycache__/data_ingest.cpython-313.pyc ADDED
Binary file (3.98 kB). View file
 
test/__pycache__/eval_lm.cpython-313.pyc ADDED
Binary file (5.68 kB). View file
 
test/__pycache__/eval_qa.cpython-313.pyc ADDED
Binary file (6.52 kB). View file
 
test/__pycache__/prepare_retrieve.cpython-313.pyc ADDED
Binary file (3.71 kB). View file
 
test/__pycache__/test_llm.cpython-313.pyc ADDED
Binary file (603 Bytes). View file
 
test/__pycache__/test_retrieve.cpython-313.pyc ADDED
Binary file (2.32 kB). View file
 
test/_normalize_qa.py ADDED
@@ -0,0 +1,43 @@
+ # import json
+ # import uuid
+
+ # origin_qa_data_path = 'dataset/QA Data/MedMCQA/hard_questions.jsonl'
+ # target_qa_data_path = 'dataset/QA Data/MedMCQA/translated_hard_questions.jsonl'
+
+ # def transform_id(origin_id):
+ #     # Add 'T' prefix and remove last character
+ #     return ' T' + origin_id[:-1]
+
+ # def update_answers():
+ #     # Read origin data
+ #     with open(origin_qa_data_path, 'r', encoding='utf-8') as f:
+ #         origin_data = [json.loads(line) for line in f]
+
+ #     # Read target data
+ #     with open(target_qa_data_path, 'r', encoding='utf-8') as f:
+ #         target_data = [json.loads(line) for line in f]
+
+ #     c = []
+ #     for item in origin_data:
+ #         for target_item in target_data:
+ #             if transform_id(item['id']) == target_item['uuid']:
+ #                 if item['cop'] == 0:
+ #                     target_item['answer'] = 'A'
+ #                 elif item['cop'] == 1:
+ #                     target_item['answer'] = 'B'
+ #                 elif item['cop'] == 2:
+ #                     target_item['answer'] = 'C'
+ #                 elif item['cop'] == 3:
+ #                     target_item['answer'] = 'D'
+ #                 c.extend([target_item['uuid']])
+ #     # print(c)
+ #     for item in target_data:
+ #         if item['uuid'] not in c:
+ #             print(item['uuid'])
+ #     # Write updated target data back to file
+ #     with open(target_qa_data_path, 'w', encoding='utf-8') as f:
+ #         for item in target_data:
+ #             f.write(json.dumps(item, ensure_ascii=False) + '\n')
+
+ # # Call the function to update answers
+ # update_answers()
test/chatbot_inference.py ADDED
@@ -0,0 +1,30 @@
+ from rag_pipeline import get_embeddings, vretrieve, rerank
+ from utils import load_local
+
+ import argparse
+
+ def inference(query: str, args):
+     embed_model = get_embeddings(args.embed_model_name)
+     vectorstore, docs = load_local(args.vectorstore_dir, embed_model)
+     retrieve_results = vretrieve(query, vectorstore, docs, args.retriever_k, args.metric, args.threshold)
+
+     retrieve_results = rerank(retrieve_results)
+
+     print(retrieve_results)
+
+ def conversation(args):
+     while True:
+         query = input("User: ")
+         if query == "exit":
+             break
+         inference(query, args)
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--vectorstore_dir", type=str, default="notebook/An/master/knowledge/vectorstore_full")
+     parser.add_argument("--embed_model_name", type=str, default="alibaba-nlp/gte-multilingual-base")
+     parser.add_argument("--retriever_k", type=int, default=4)
+     parser.add_argument("--metric", type=str, choices=["cosine", "mmr", "bm25"], default="cosine")
+     parser.add_argument("--threshold", type=float, default=0.7)
+     args = parser.parse_args()
+     conversation(args)
test/data_ingest.py ADDED
@@ -0,0 +1,78 @@
+ import argparse
+ import os
+ from typing import List
+
+ from ..rag_pipeline import get_embeddings, load_data
+ from ..utils import load_local, save_local
+
+ def main(args):
+     print(f"Log: {args}")
+
+     if args.clear_vectorstore:
+         import shutil
+         if os.path.isdir(args.vectorstore_dir):
+             shutil.rmtree(args.vectorstore_dir)
+
+     embed_model = get_embeddings(args.embed_model_name)
+     vectorstore, docs = load_local(args.vectorstore_dir, embed_model)
+
+     new_docs = []
+     for data_path in args.data_paths:
+         new_docs.extend(load_data(data_path, args.file_type))
+     print(f"Got {len(new_docs)} documents.")
+
+     if args.chunk_method == "recursive":
+         from ..rag_pipeline import recursive_chunking
+         new_docs = recursive_chunking(new_docs, args.chunk_size, args.chunk_overlap)
+     elif args.chunk_method == "markdown":
+         from ..rag_pipeline import markdown_chunking
+         new_docs = markdown_chunking(new_docs, args.chunk_size, args.chunk_overlap)
+     print(f"Got {len(new_docs)} chunks.")
+
+     from langchain_community.vectorstores import FAISS
+     if vectorstore is None:
+         vectorstore = FAISS.from_documents(new_docs, embed_model)
+         docs = new_docs
+         print(f"Successfully consumed {len(new_docs)} documents.")
+     else:
+         docs.extend(new_docs)
+         vectorstore.add_documents(new_docs)
+
+     save_local(args.vectorstore_dir, vectorstore, docs)
+
+     import json
+     with open(os.path.join(args.vectorstore_dir, "config.json"), "a") as f:
+         json.dump(vars(args), f)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+
+     data_paths = [
+         'dataset/RAG_Data/wiki_vi',
+         'dataset/RAG_Data/youmed',
+         'dataset/RAG_Data/mimic_ex_report',
+         'dataset/RAG_Data/Download sach y/OCR',
+     ]
+
+     # Dataset params
+     parser.add_argument("--data_paths", type=str, nargs="+", required=False, default=data_paths)
+     parser.add_argument("--vectorstore_dir", type=str, required=False, default="notebook/An/master/knowledge/vectorstore_full")
+     parser.add_argument("--file_type", type=str, choices=["pdf", "txt"], default="txt")
+
+     # Model params
+     parser.add_argument("--embed_model_name", type=str, default="alibaba-nlp/gte-multilingual-base")
+
+     # Index params
+     parser.add_argument("--chunk_size", type=int, default=2048)
+     parser.add_argument("--chunk_overlap", type=int, default=512)
+     parser.add_argument("--chunk_method", type=str, choices=["recursive", "markdown"], default="markdown")
+
+     # Vectorstore params
+     parser.add_argument("--vectorstore", type=str, choices=["faiss", "chroma"], default="faiss")
+     parser.add_argument("--clear_vectorstore", action="store_true", default=True)
+
+
+     args = parser.parse_args()
+
+     main(args)
test/eval_lm.py ADDED
@@ -0,0 +1,87 @@
+ import argparse
+ from ..rag_pipeline import qa_prompt
+ from ..rag_pipeline import ChatAssistant
+ from ..utils import load_qa_dataset, load_prepared_retrieve_docs
+
+ from typing import List, Optional
+ from langchain.schema import Document
+
+ def get_answer_from_response(llm_response: str) -> str:
+     return llm_response.strip()
+
+ def build_qa_prompt(question: str, document: Optional[List[Document]]) -> str:
+     if document is not None:
+         document = '\n'.join([f"Document {i+1}:\n" + doc.page_content for i, doc in enumerate(document)])
+
+     return qa_prompt.format(question=question, document=document)
+
+ def process_question(question, prompt, answer, id, args, llm):
+     llm_response = llm.get_response("", prompt)
+     # ans = get_answer_from_response(llm_response)
+     with open("log.txt", "a", encoding="utf-8") as f:
+         f.write(f"ID: {id}\n")
+         f.write(prompt)
+         f.write(f"LLM Response:\n{llm_response}\n")
+         f.write(f"Answer: {answer} \n\n")
+
+     # with open("log_score.txt", "a", encoding="utf-8") as f:
+     #     f.write("1" if ans == answer else "0")
+     # return 1 if ans == answer else 0
+     return llm_response
+
+ def evaluate_qa(questions, prompts, answers, ids, args, llm):
+     import concurrent.futures
+     from tqdm import tqdm
+     ans = []
+     with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+         futures = [executor.submit(process_question, questions[i], prompts[i], answers[i], ids[i], args, llm) for i in range(len(questions))]
+         for future in tqdm(concurrent.futures.as_completed(futures), total=len(questions)):
+             ans.append(future.result())
+     return ans
+
+ def main(args):
+     ids, questions, options, answers = load_qa_dataset(args.qa_file)
+
+     if ids is None:
+         raise ValueError(f"No id field in {args.qa_file}.")
+
+     if args.num_docs > 0:
+         if args.prepared_retrieve_docs_path is not None:
+             documents = load_prepared_retrieve_docs(args.prepared_retrieve_docs_path)
+             docs = [d[:args.num_docs] for i, d in enumerate(documents)]
+         else:
+             raise ValueError("No prepared retrieve docs found.")
+     else:
+         docs = [None]*len(questions)
+
+     prompts = [build_qa_prompt(questions[i], docs[i]) for i in range(len(questions))]
+
+     llm = ChatAssistant(args.model_name, args.provider)
+
+     with open("log_score.txt", "a", encoding="utf-8") as f:
+         f.write("\n")
+
+     qa_results = evaluate_qa(questions, prompts, answers, ids, args, llm)
+     qa_results = [qa_results[i][qa_results[i].rfind("[")+1:qa_results[i].rfind("]")] for i in range(len(qa_results))]
+     # print(f"{qa_results}")
+     import pyperclip
+     pyperclip.copy('\n'.join(qa_results))
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--qa_file", type=str, default="dataset/QA Data/random.jsonl")
+     parser.add_argument("--prepared_retrieve_docs_path", type=str, default="prepared_retrieve_docs.pkl")
+
+     parser.add_argument("--model_name", type=str, default="mistral-medium")
+     parser.add_argument("--provider", type=str, default="mistral")
+     parser.add_argument("--max_workers", type=int, default=4)
+     parser.add_argument("--num_docs", type=int, default=0)
+
+     parser.add_argument("--dataset_path", type=str)
+
+     args = parser.parse_args()
+
+     print(args)
+
+     main(args)
test/eval_qa.py ADDED
@@ -0,0 +1,106 @@
+ import argparse
+ from ..rag_pipeline import multichoice_qa_prompt
+ from ..rag_pipeline import ChatAssistant
+ from ..utils import paralelize, load_qa_dataset, load_prepared_retrieve_docs
+
+ from datetime import datetime
+ from typing import List, Optional
+ from langchain.schema import Document
+
+ def get_answer_from_response(llm_response: str) -> str:
+     """
+     Get the answer from the LLM response.
+     """
+     return llm_response[llm_response.lower().rfind("the answer is ") + 14]
+
+ def build_multichoice_qa_prompt(question: str, options: str, document: Optional[List[Document]]) -> str:
+     """
+     Build the prompt for the multichoice QA task.
+     """
+     if document is not None:
+         document = '\n'.join([f"Document {i+1}:\n" + doc.page_content for i, doc in enumerate(document)])
+
+     return multichoice_qa_prompt.format(question=question, options=options, document=document)
+
+ def process_question(question, prompt, answer, id, args, llm):
+     llm_response = ""
+     for j in range(args.retries):
+         try:
+             llm_response = llm.get_response("", prompt)
+             ans = get_answer_from_response(llm_response)
+             if ans in ["A", "B", "C", "D", "E"]:
+                 with open("log.txt", "a", encoding="utf-8") as f:
+                     f.write(f"ID: {id}\n")
+                     f.write(prompt)
+                     f.write(f"LLM Response:\n{llm_response}\n")
+                     f.write(f"Answer: {answer} {ans}\n\n")
+                 break
+         except Exception as e:
+             print(f"Error: {e}")
+             ans = "#"
+     with open("log_score.txt", "a", encoding="utf-8") as f:
+         f.write("1" if ans == answer else "0")
+     return 1 if ans == answer else 0
+
+ def evaluate_qa(questions, prompts, answers, ids, args, llm):
+     import concurrent.futures
+     from tqdm import tqdm
+     correct = 0
+     with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+         futures = [executor.submit(process_question, questions[i], prompts[i], answers[i], ids[i], args, llm) for i in range(len(questions))]
+         for future in tqdm(concurrent.futures.as_completed(futures), total=len(questions)):
+             correct += future.result()
+     return correct / len(questions)
+
+
+ def main(args):
+     ids, questions, options, answers = load_qa_dataset(args.qa_file)
+
+     if ids is None:
+         raise ValueError(f"No id field in {args.qa_file}.")
+
+     if args.num_docs > 0:
+         if args.prepared_retrieve_docs_path is not None:
+             documents = load_prepared_retrieve_docs(args.prepared_retrieve_docs_path)
+             docs = [d[:args.num_docs] for i, d in enumerate(documents)]
+         else:
+             raise ValueError("No prepared retrieve docs found.")
+     else:
+         docs = [None]*len(questions)
+
+     prompts = [build_multichoice_qa_prompt(questions[i], options[i], docs[i]) for i in range(len(questions))]
+
+     # print(prompts[0])
+     llm = ChatAssistant(args.model_name, args.provider)
+
+     with open("log_score.txt", "a", encoding="utf-8") as f:
+         f.write(f"\n{datetime.now()} {args}\n")
+
+     acc = evaluate_qa(questions, prompts, answers, ids, args, llm)
+     print(f"Accuracy: {acc}")
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+
+     # parser.add_argument("--qa_file", type=str, default="dataset/QA Data/MedAB/MedABv2.jsonl")
+     # parser.add_argument("--prepared_retrieve_docs_path", type=str, default="dataset/QA Data/MedAB/prepared_retrieve_docs_full.pkl")
+
+     parser.add_argument("--qa_file", type=str, default="dataset/QA Data/MedMCQA/translated_hard_questions.jsonl")
+     parser.add_argument("--prepared_retrieve_docs_path", type=str, default="dataset/QA Data/MedMCQA/prepared_retrieve_docs_full.pkl")
+
+     # Eval params
+     parser.add_argument("--model_name", type=str, default="mistral-medium")
+     parser.add_argument("--provider", type=str, default="mistral")
+     parser.add_argument("--max_workers", type=int, default=4)
+     parser.add_argument("--num_docs", type=int, default=0)
+     parser.add_argument("--retries", type=int, default=4)
+
+
+     # Dataset params
+     parser.add_argument("--dataset_path", type=str)
+
+     args = parser.parse_args()
+     print(f"Log:{args}")
+
+     main(args)
test/prepare_retrieve.py ADDED
@@ -0,0 +1,50 @@
+ import argparse
+ import os
+
+ from ..rag_pipeline import get_embeddings, vretrieve
+ from ..utils import load_local, load_qa_dataset, safe_save_langchain_docs
+
+ def main(args):
+     embed_model = get_embeddings(args.embed_model_name, show_progress=False)
+     vectorstore, docs = load_local(args.vectorstore_dir, embed_model)
+
+     ids, questions, options, answers = load_qa_dataset(args.qa_data_path)
+
+     rag_queries = [f"Question: {questions[i]}\n{options[i]}" for i in range(len(questions))]
+     if (args.rag_queries_path is not None) and os.path.exists(args.rag_queries_path):
+         import json
+         with open(args.rag_queries_path, "r", encoding="utf-8") as f:
+             rag_queries = [json.loads(line)["query"] for line in f]
+
+     from tqdm import tqdm
+     retrieve_results = [vretrieve(rag_queries[i], vectorstore, docs, args.retriever_k, args.metric, args.threshold) for i in tqdm(range(len(rag_queries)), desc="Retrieving documents")]
+
+     safe_save_langchain_docs(retrieve_results, args.prepared_retrieve_docs_path)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+
+     # Dataset params
+     parser.add_argument("--qa_data_path", type=str, default="dataset/QA Data/MedMCQA/translated_hard_questions.jsonl")
+
+     # Vectorstore params
+     parser.add_argument("--vectorstore_dir", type=str, default="notebook/An/master/knowledge/vectorstore_full")
+     parser.add_argument("--prepared_retrieve_docs_path", type=str, default="dataset/QA Data/MedMCQA/prepared_retrieve_docs_full.pkl")
+     parser.add_argument("--rag_queries_path", type=str, default=None)
+
+     # Model params
+     parser.add_argument("--embed_model_name", type=str, default="alibaba-nlp/gte-multilingual-base")
+
+     # Vectorstore retriever params
+     parser.add_argument("--vectorstore", type=str, choices=["faiss", "chroma"], default="faiss")
+     parser.add_argument("--metric", type=str, choices=["cosine", "mmr", "bm25"], default="mmr")
+     parser.add_argument("--retriever_k", type=int, default=20, help="Number of documents to retrieve")
+     parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for cosine similarity")
+     parser.add_argument("--reranker_model_name", type=str, default=None)
+     parser.add_argument("--reranker_k", type=int, default=50, help="Number of documents to rerank")
+
+     args = parser.parse_args()
+     print(args)
+
+     main(args)
test/test_llm.py ADDED
@@ -0,0 +1,9 @@
+ from ..rag_pipeline import ChatAssistant
+ from ..rag_pipeline import request_retrieve_prompt
+
+ cb = ChatAssistant("mistral-medium", "mistral")
+
+ query = "Beta blocker for hypertension"
+ query = request_retrieve_prompt.format(conversation=query, role="customer")
+ response = cb.get_response(user=query)
+ print(response)
test/test_retrieve.py ADDED
@@ -0,0 +1,39 @@
+ import argparse
+ import os
+
+ from ..rag_pipeline import get_embeddings, rerank
+ from ..utils import load_local
+
+ from ..rag_pipeline import vretrieve
+
+ def main(args):
+     embed_model = get_embeddings(args.embed_model_name)
+     vectorstore, docs = load_local(args.vectorstore_dir, embed_model)
+     retrieve_results = vretrieve(args.query, vectorstore, docs, args.retriever_k, args.metric, args.threshold)
+
+     retrieve_results = rerank(retrieve_results)
+
+     print(retrieve_results)
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--query", type=str, required=False, default="What are the applications of beta blockers in the treatment of hypertension?")
+
+     # Vectorstore params
+     parser.add_argument("--vectorstore_dir", type=str, required=False, default="notebook/An/master/knowledge/vectorstore_full")
+
+     # Model params
+     parser.add_argument("--embed_model_name", type=str, default="alibaba-nlp/gte-multilingual-base")
+
+     # Vectorstore retriever params
+     parser.add_argument("--vectorstore", type=str, choices=["faiss", "chroma"], default="faiss")
+     parser.add_argument("--metric", type=str, choices=["cosine", "mmr", "bm25"], default="cosine")
+     parser.add_argument("--retriever_k", type=int, default=4, help="Number of documents to retrieve")
+     parser.add_argument("--threshold", type=float, default=0.7, help="Threshold for cosine similarity")
+     parser.add_argument("--reranker_model_name", type=str, default=None)
+     parser.add_argument("--reranker_k", type=int, default=20, help="Number of documents to rerank")
+
+     args = parser.parse_args()
+
+     main(args)
utils.py ADDED
@@ -0,0 +1,211 @@
+ import os
+ import pickle
+ from typing import List, Optional
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.schema import Document
+
+
+ def load_local(vectorstore_dir: str, embed_model: HuggingFaceEmbeddings) -> tuple[Optional[FAISS], Optional[List[Document]]]:
+     """
+     Load the vectorstore and documents from disk.
+     Args:
+         vectorstore_dir: The directory to load the vectorstore from.
+         embed_model: The embedding model to use.
+     Returns:
+         vector_store: The vectorstore.
+     """
+     from langchain_community.vectorstores import FAISS
+
+     if not os.path.isdir(vectorstore_dir):
+         print(f"Vectorstore directory not found at {vectorstore_dir}. Creating a new one.")
+         os.makedirs(vectorstore_dir, exist_ok=True)
+
+     try:
+         vector_store = FAISS.load_local(vectorstore_dir, embed_model, allow_dangerous_deserialization=True)
+
+         docs_path = os.path.join(vectorstore_dir, "docs.pkl")
+         if os.path.exists(docs_path):
+             with open(docs_path, "rb") as f:
+                 docs = pickle.load(f)
+         else:
+             docs = None
+             print("Warning: docs.pkl not found. BM25 search will not be available.")
+
+         print(f"Successfully loaded RAG state from {vectorstore_dir}")
+         return vector_store, docs
+     except Exception as e:
+         print(f"Could not load from {vectorstore_dir}. It might be empty or corrupted. Error: {e}")
+         return None, None
+
+ def save_local(vectorstore_dir: str, vectorstore: FAISS, docs: Optional[List[Document]]) -> None:
+     """
+     Save the vectorstore and documents to disk.
+     Args:
+         vectorstore_dir: The directory to save the vectorstore to.
+         vectorstore: The vectorstore to save.
+         docs: The documents to save.
+     """
+     if vectorstore is None:
+         raise ValueError("Nothing to save.")
+     if docs is None:
+         print("Warning: No documents to save. BM25 search will not be available.")
+
+     os.makedirs(vectorstore_dir, exist_ok=True)
+     vectorstore.save_local(vectorstore_dir)
+
+     if docs is not None:
+         with open(os.path.join(vectorstore_dir, "docs.pkl"), "wb") as f:
+             pickle.dump(docs, f)
+
+     print(f"Successfully saved RAG state to {vectorstore_dir}")
+
+ def load_qa_dataset(qa_dataset_path: str) -> tuple[List[str], List[str], List[str], List[str]]:
+     """
+     Load the QA dataset. (jsonl)
+     Args:
+         qa_dataset_path: The path to the QA dataset.
+     Returns:
+         Tuple: (ids, questions, options, answers)\\
+         ids: The ids of the questions\\
+         questions: The questions\\
+         options: The options for each question\\
+         answers: The answers for each question.
+     """
+     import json
+     if not os.path.exists(qa_dataset_path):
+         raise FileNotFoundError(f"Error: File not found at {qa_dataset_path}")
+
+     with open(qa_dataset_path, "r", encoding="utf-8") as f:
+         data = [json.loads(line) for line in f]
+     questions = [item["question"] for item in data]
+     try:
+         options = [
+             (f"A. {item['A']} \n" if item['A'] not in [" ", "", None] else "") +
+             (f"B. {item['B']} \n" if item['B'] not in [" ", "", None] else "") +
+             (f"C. {item['C']} \n" if item['C'] not in [" ", "", None] else "") +
+             (f"D. {item['D']} \n" if item['D'] not in [" ", "", None] else "") +
+             (f"E. {item['E']} \n" if item['E'] not in [" ", "", None] else "")
+             for item in data]
+     except KeyError:
+         options = [" " for item in data]
+     answers = [item["answer"] for item in data]
+     uuids = [item["uuid"] for item in data]
+     return uuids, questions, options, answers
+
+ def load_prepared_retrieve_docs(prepared_retrieve_docs_path: str) -> List[List[Document]]:
+     """
+     Load the prepared retrieve docs from a file.
+     Args:
+         prepared_retrieve_docs_path: The path to the prepared retrieve docs.
+     Returns:
+         A list of lists of documents.
+     """
+     return safe_load_langchain_docs(prepared_retrieve_docs_path)
+
+ def paralelize(func, max_workers: int = 4, **kwargs) -> List:
+     """
+     Parallelizes a function call over multiple keyword argument iterables.
+
+     Args:
+         func: The function to execute in parallel.
+         max_workers: The maximum number of threads to use.
+         **kwargs: Keyword arguments where each value is an iterable (e.g., a list).
+                   All iterables must be of the same length.
+                   The keyword names do not matter, but their order does.
+     Returns:
+         A list of the results of the function calls.
+     """
+     from concurrent.futures import ThreadPoolExecutor
+     from tqdm import tqdm
+
+     if not kwargs:
+         return []
+
+     arg_lists = list(kwargs.values())
+     if len(set(len(lst) for lst in arg_lists)) > 1:
+         raise ValueError("All iterable arguments must have the same length.")
+
+     total_items = len(arg_lists[0])
+     iterable = zip(*arg_lists)
+     unpacker_func = lambda args_tuple: func(*args_tuple)
+
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         results = list(tqdm(executor.map(unpacker_func, iterable), total=total_items))
+     return results
+
+ def safe_save_langchain_docs(documents: List[List[Document]], filepath: str):
+     """
+     Converts LangChain Document objects into a serializable list of dictionaries
+     and saves them to a file using pickle.
+
+     Args:
+         documents (List[List[Document]]): The nested list of LangChain Documents.
+         filepath (str): The path to the file where the data will be saved.
+     """
+     serializable_data = []
+     print(f"Preparing to save {len(documents)} lists of documents...")
+
+     # Convert each Document object into a dictionary
+     for doc_list in documents:
+         serializable_doc_list = []
+         for doc in doc_list:
+             serializable_doc_list.append({
+                 "page_content": doc.page_content,
+                 "metadata": doc.metadata,
+             })
+         serializable_data.append(serializable_doc_list)
+
+     print(f"Conversion complete. Saving to {filepath}...")
+     try:
+         # Use 'with' to ensure the file is closed properly, even if errors occur
+         with open(filepath, "wb") as f:
+             pickle.dump(serializable_data, f)
+         print("File saved successfully.")
+     except Exception as e:
+         print(f"An error occurred while saving the file: {e}")
+
+ def safe_load_langchain_docs(filepath: str) -> List[List[Document]]:
+     """
+     Loads data from a pickle file and reconstructs the LangChain Document objects.
+
+     Args:
+         filepath (str): The path to the file to load.
+
+     Returns:
+         List[List[Document]]: The reconstructed nested list of LangChain Documents.
+     """
+     reconstructed_documents = []
+
+     print(f"Loading data from {filepath}...")
+     try:
+         with open(filepath, "rb") as f:
+             loaded_data = pickle.load(f)
+         print("File loaded successfully. Reconstructing Document objects...")
+
+         # Reconstruct the Document objects from the dictionaries
+         for doc_list_data in loaded_data:
+             reconstructed_doc_list = []
+             for doc_data in doc_list_data:
+                 reconstructed_doc_list.append(
+                     Document(
+                         page_content=doc_data["page_content"],
+                         metadata=doc_data["metadata"]
+                     )
+                 )
+             reconstructed_documents.append(reconstructed_doc_list)
+
+         print("Document objects reconstructed successfully.")
+         return reconstructed_documents
+
+     except FileNotFoundError:
+         print(f"Error: The file at {filepath} was not found.")
+         return []
+     except EOFError:
+         print(f"Error: The file at {filepath} is corrupted or incomplete (EOFError).")
+         print("Please re-run the script that generates this file.")
+         return []
+     except Exception as e:
+         print(f"An unexpected error occurred while loading the file: {e}")
+         return []
+ return []