# import packages
import numpy as np
import polars as pl
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel


def encode(sentences, tokenizer, model, device="mps"):
    """Embed a string or list of strings by mean-pooling the token embeddings."""
    inputs = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device=device)
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs.last_hidden_state has shape [batch, tokens, hidden_dim];
    # mean pooling over the token axis gives one vector per sentence
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings


# define the device where torch calculations take place
my_device = "cpu"

# Instantiate the sentence-transformer model:
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentence_model = AutoModel.from_pretrained(model_name).to(device=my_device)

# One row per text block: 'file' and 'doc_block_indx' identify the block,
# and the remaining columns hold the embedding dimensions.
block_embeddings_df = pl.read_parquet("outputs/block_embeddings_df.parquet.zstd")


def sbert_query(query, corpus_embeddings_df):
    """Score every block against the query, keep the best-scoring block per
    file, and return the files ranked by that score."""
    query_embeddings = (
        encode(query, tokenizer=sentence_tokenizer, model=sentence_model, device=my_device)
        .cpu()
        .numpy()
    )
    sbert_scores = cosine_similarity(
        query_embeddings,
        corpus_embeddings_df.select(pl.exclude(["file", "doc_block_indx"])).to_numpy(),
    )
    scores_df = (
        pl.DataFrame(
            {
                "score": sbert_scores.ravel(),
                "file": corpus_embeddings_df["file"],
                "doc_block_indx": corpus_embeddings_df["doc_block_indx"],
            }
        )
        .group_by("file")
        .agg(pl.col("score").max())
        .sort("score", descending=True)
    )
    return scores_df.with_columns(
        pl.Series("rank-sbert", list(range(1, scores_df.height + 1)))
    )


def sbert_query_factory(corpus_embeddings_df):
    """Bind a corpus to sbert_query() so callers only pass the query text."""
    def do_sbert_query(my_query):
        return sbert_query(my_query, corpus_embeddings_df)
    return do_sbert_query


# create a function to run the SBERT queries against the block embeddings
sbert_query_docs = sbert_query_factory(block_embeddings_df)

query = "plans for raising grant revenue directed to the libraries"
res_sbert = sbert_query_docs(query)
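
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original pipeline): inspect the
# top-ranked files for the query above, and reuse the closure produced by
# sbert_query_factory() for a second query. Only the columns created by
# sbert_query() ('file', 'score', 'rank-sbert') are assumed; the second
# query string and the head() row count of 5 are hypothetical choices.
print(res_sbert.head(5))

# The closure holds a reference to block_embeddings_df, so further queries
# need only the query text:
other_query = "budget projections for the next fiscal year"  # hypothetical
res_other = sbert_query_docs(other_query)
print(res_other.head(5))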