Spaces:
Running
on
L4
Running
on
L4
Upload folder using huggingface_hub
Browse files- README.md +10 -0
- prepare_feed_deploy.py +59 -46
- requirements.txt +84 -2
- vespa_feed_to_hf_dataset.py +42 -0
README.md
CHANGED
|
@@ -126,6 +126,16 @@ python main.py
|
|
| 126 |
|
| 127 |
## Deploy to huggingface 🤗
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
To deploy, run
|
| 130 |
|
| 131 |
```bash
|
|
|
|
| 126 |
|
| 127 |
## Deploy to huggingface 🤗
|
| 128 |
|
| 129 |
+
### Compiling dependencies
|
| 130 |
+
|
| 131 |
+
Before deploying, if you have changed the dependencies, run the following `uv` command to compile them from `pyproject.toml` into `requirements.txt`:
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
uv pip compile pyproject.toml -o requirements.txt
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
### Deploying to huggingface
|
| 138 |
+
|
| 139 |
To deploy, run
|
| 140 |
|
| 141 |
```bash
|
prepare_feed_deploy.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
# %% [markdown]
|
| 2 |
# # Visual PDF Retrieval - demo application
|
| 3 |
-
#
|
| 4 |
# In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
|
| 5 |
# We will use ColPali as the model to extract patch vectors from images of pdf pages.
|
| 6 |
# At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
|
| 7 |
-
#
|
| 8 |
# To see the application in action, visit TODO:
|
| 9 |
-
#
|
| 10 |
# The web application is written in FastHTML, meaning the complete application is written in python.
|
| 11 |
-
#
|
| 12 |
# The steps we will take in this notebook are:
|
| 13 |
-
#
|
| 14 |
# 0. Setup and configuration
|
| 15 |
# 1. Download the data
|
| 16 |
# 2. Prepare the data
|
|
@@ -18,14 +18,14 @@
|
|
| 18 |
# 4. Deploy the Vespa application
|
| 19 |
# 5. Create the Vespa application
|
| 20 |
# 6. Feed the data to the Vespa application
|
| 21 |
-
#
|
| 22 |
# All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
|
| 23 |
# We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
|
| 24 |
-
#
|
| 25 |
|
| 26 |
# %% [markdown]
|
| 27 |
# ## 0. Setup and Configuration
|
| 28 |
-
#
|
| 29 |
|
| 30 |
# %%
|
| 31 |
import os
|
|
@@ -83,11 +83,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
| 83 |
|
| 84 |
# %% [markdown]
|
| 85 |
# ### Create a free trial in Vespa Cloud
|
| 86 |
-
#
|
| 87 |
# Create a tenant from [here](https://vespa.ai/free-trial/).
|
| 88 |
# The trial includes $300 credit.
|
| 89 |
# Take note of your tenant name.
|
| 90 |
-
#
|
| 91 |
|
| 92 |
# %%
|
| 93 |
VESPA_TENANT_NAME = "vespa-team"
|
|
@@ -95,17 +95,17 @@ VESPA_TENANT_NAME = "vespa-team"
|
|
| 95 |
# %% [markdown]
|
| 96 |
# Here, set your desired application name. (Will be created in later steps)
|
| 97 |
# Note that the application name cannot contain a hyphen `-` or an underscore `_`.
|
| 98 |
-
#
|
| 99 |
|
| 100 |
# %%
|
| 101 |
-
VESPA_APPLICATION_NAME = "
|
| 102 |
VESPA_SCHEMA_NAME = "pdf_page"
|
| 103 |
|
| 104 |
# %% [markdown]
|
| 105 |
# Next, you need to create some tokens for feeding data, and querying the application.
|
| 106 |
# We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
|
| 107 |
# The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
|
| 108 |
-
#
|
| 109 |
|
| 110 |
# %%
|
| 111 |
VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
|
|
@@ -113,7 +113,7 @@ VESPA_TOKEN_ID_READ = "colpalidemo_read"
|
|
| 113 |
|
| 114 |
# %% [markdown]
|
| 115 |
# We also need to set the value of the write token to be able to feed data to the Vespa application.
|
| 116 |
-
#
|
| 117 |
|
| 118 |
# %%
|
| 119 |
VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
|
@@ -124,7 +124,7 @@ VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
|
| 124 |
# We will also use the Gemini API to create sample queries for our images.
|
| 125 |
# You can also use other VLMs to create these queries.
|
| 126 |
# Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
|
| 127 |
-
#
|
| 128 |
|
| 129 |
# %%
|
| 130 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
|
|
@@ -152,21 +152,21 @@ processor = ColPaliProcessor.from_pretrained(MODEL_NAME)
|
|
| 152 |
|
| 153 |
# %% [markdown]
|
| 154 |
# ## 1. Download PDFs
|
| 155 |
-
#
|
| 156 |
# We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
|
| 157 |
# The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
|
| 158 |
-
#
|
| 159 |
# These reports are the ones we are going to use for this showcase.
|
| 160 |
# Here are some sample images:
|
| 161 |
-
#
|
| 162 |
# 
|
| 163 |
# 
|
| 164 |
-
#
|
| 165 |
|
| 166 |
# %% [markdown]
|
| 167 |
# As we can see, a lot of the information is in the form of tables, charts and numbers.
|
| 168 |
# These are not easily extractable using pdf-readers or OCR tools.
|
| 169 |
-
#
|
| 170 |
|
| 171 |
# %%
|
| 172 |
import requests
|
|
@@ -180,16 +180,20 @@ html_content = response.text
|
|
| 180 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 181 |
|
| 182 |
links = []
|
|
|
|
| 183 |
|
| 184 |
-
# Find all
|
| 185 |
-
for
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
| 188 |
href = a_tag["href"]
|
| 189 |
full_url = urljoin(url, href)
|
| 190 |
links.append(full_url)
|
| 191 |
-
|
| 192 |
-
links
|
| 193 |
|
| 194 |
# %%
|
| 195 |
# Limit the number of PDFs to download
|
|
@@ -274,7 +278,8 @@ pdfs
|
|
| 274 |
|
| 275 |
# %% [markdown]
|
| 276 |
# ## 2. Convert PDFs to Images
|
| 277 |
-
#
|
|
|
|
| 278 |
|
| 279 |
# %%
|
| 280 |
def get_pdf_images(pdf_path):
|
|
@@ -300,6 +305,7 @@ for pdf in tqdm(pdfs):
|
|
| 300 |
pdf_pages.append(
|
| 301 |
{
|
| 302 |
"title": title,
|
|
|
|
| 303 |
"url": pdf["url"],
|
| 304 |
"path": pdf_file,
|
| 305 |
"image": image,
|
|
@@ -324,17 +330,17 @@ print(f"Number of text with length == 0: {Counter(text_lengths)[0]}")
|
|
| 324 |
|
| 325 |
# %% [markdown]
|
| 326 |
# ## 3. Generate Queries
|
| 327 |
-
#
|
| 328 |
# In this step, we want to generate queries for each page image.
|
| 329 |
# These will be useful for 2 reasons:
|
| 330 |
-
#
|
| 331 |
# 1. We can use these queries as typeahead suggestions in the search bar.
|
| 332 |
# 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
|
| 333 |
-
#
|
| 334 |
# The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
|
| 335 |
-
#
|
| 336 |
# We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
|
| 337 |
-
#
|
| 338 |
|
| 339 |
# %%
|
| 340 |
from pydantic import BaseModel
|
|
@@ -413,6 +419,7 @@ def generate_queries(image, prompt_text, pydantic_model):
|
|
| 413 |
}
|
| 414 |
return queries
|
| 415 |
|
|
|
|
| 416 |
# %%
|
| 417 |
for pdf in tqdm(pdf_pages):
|
| 418 |
image = pdf.get("image")
|
|
@@ -488,9 +495,10 @@ with open("output/pdf_pages.json", "w") as f:
|
|
| 488 |
|
| 489 |
# %% [markdown]
|
| 490 |
# ## 4. Generate embeddings
|
| 491 |
-
#
|
| 492 |
# Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
|
| 493 |
-
#
|
|
|
|
| 494 |
|
| 495 |
# %%
|
| 496 |
def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
|
@@ -530,6 +538,7 @@ def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
|
| 530 |
all_embeddings = np.concatenate(embeddings_list, axis=0)
|
| 531 |
return all_embeddings
|
| 532 |
|
|
|
|
| 533 |
# %%
|
| 534 |
# Generate embeddings for all images
|
| 535 |
images = [pdf["image"] for pdf in pdf_pages]
|
|
@@ -540,9 +549,10 @@ embeddings.shape
|
|
| 540 |
|
| 541 |
# %% [markdown]
|
| 542 |
# ## 5. Prepare Data on Vespa Format
|
| 543 |
-
#
|
| 544 |
# Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
|
| 545 |
-
#
|
|
|
|
| 546 |
|
| 547 |
# %%
|
| 548 |
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
|
@@ -555,10 +565,12 @@ def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
|
| 555 |
binary_query_embeddings[k] = binary_vector
|
| 556 |
return binary_query_embeddings
|
| 557 |
|
|
|
|
| 558 |
# %%
|
| 559 |
vespa_feed = []
|
| 560 |
for pdf, embedding in zip(pdf_pages, embeddings):
|
| 561 |
url = pdf["url"]
|
|
|
|
| 562 |
title = pdf["title"]
|
| 563 |
image = pdf["image"]
|
| 564 |
text = pdf.get("text", "")
|
|
@@ -580,6 +592,7 @@ for pdf, embedding in zip(pdf_pages, embeddings):
|
|
| 580 |
"id": id_hash,
|
| 581 |
"url": url,
|
| 582 |
"title": title,
|
|
|
|
| 583 |
"page_number": page_no,
|
| 584 |
"blur_image": base_64_image,
|
| 585 |
"full_image": base_64_full_image,
|
|
@@ -616,7 +629,7 @@ len(vespa_feed)
|
|
| 616 |
|
| 617 |
# %% [markdown]
|
| 618 |
# ## 5. Prepare Vespa Application
|
| 619 |
-
#
|
| 620 |
|
| 621 |
# %%
|
| 622 |
# Define the Vespa schema
|
|
@@ -631,6 +644,7 @@ colpali_schema = Schema(
|
|
| 631 |
match=["word"],
|
| 632 |
),
|
| 633 |
Field(name="url", type="string", indexing=["summary", "index"]),
|
|
|
|
| 634 |
Field(
|
| 635 |
name="title",
|
| 636 |
type="string",
|
|
@@ -720,9 +734,7 @@ colpali_schema = Schema(
|
|
| 720 |
DocumentSummary(
|
| 721 |
name="suggestions",
|
| 722 |
summary_fields=[
|
| 723 |
-
Summary(
|
| 724 |
-
name="questions"
|
| 725 |
-
),
|
| 726 |
],
|
| 727 |
from_disk=True,
|
| 728 |
),
|
|
@@ -756,11 +768,12 @@ mapfunctions = [
|
|
| 756 |
# Define the 'bm25' rank profile
|
| 757 |
colpali_bm25_profile = RankProfile(
|
| 758 |
name="bm25",
|
| 759 |
-
inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
|
| 760 |
first_phase="bm25(title) + bm25(text)",
|
| 761 |
functions=mapfunctions,
|
| 762 |
)
|
| 763 |
|
|
|
|
| 764 |
# A function to create an inherited rank profile which also returns quantized similarity scores
|
| 765 |
def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
| 766 |
return RankProfile(
|
|
@@ -770,6 +783,7 @@ def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
|
| 770 |
summary_features=["quantized"],
|
| 771 |
)
|
| 772 |
|
|
|
|
| 773 |
colpali_schema.add_rank_profile(colpali_bm25_profile)
|
| 774 |
colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
|
| 775 |
|
|
@@ -941,7 +955,7 @@ vespa_application_package = ApplicationPackage(
|
|
| 941 |
|
| 942 |
# %% [markdown]
|
| 943 |
# ## 6. Deploy Vespa Application
|
| 944 |
-
#
|
| 945 |
|
| 946 |
# %%
|
| 947 |
VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
|
|
@@ -966,17 +980,18 @@ print(f"Application deployed. Token endpoint URL: {endpoint_url}")
|
|
| 966 |
# %% [markdown]
|
| 967 |
# Make sure to take note of the token endpoint_url.
|
| 968 |
# You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
|
| 969 |
-
#
|
| 970 |
|
| 971 |
# %% [markdown]
|
| 972 |
# ## 8. Feed Data to Vespa
|
| 973 |
-
#
|
| 974 |
|
| 975 |
# %%
|
| 976 |
# Instantiate Vespa connection using token
|
| 977 |
app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
|
| 978 |
app.get_application_status()
|
| 979 |
|
|
|
|
| 980 |
# %%
|
| 981 |
def callback(response: VespaResponse, id: str):
|
| 982 |
if not response.is_successful():
|
|
@@ -987,5 +1002,3 @@ def callback(response: VespaResponse, id: str):
|
|
| 987 |
|
| 988 |
# Feed data into Vespa asynchronously
|
| 989 |
app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
|
| 990 |
-
|
| 991 |
-
|
|
|
|
| 1 |
# %% [markdown]
|
| 2 |
# # Visual PDF Retrieval - demo application
|
| 3 |
+
#
|
| 4 |
# In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
|
| 5 |
# We will use ColPali as the model to extract patch vectors from images of pdf pages.
|
| 6 |
# At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
|
| 7 |
+
#
|
| 8 |
# To see the application in action, visit TODO:
|
| 9 |
+
#
|
| 10 |
# The web application is written in FastHTML, meaning the complete application is written in python.
|
| 11 |
+
#
|
| 12 |
# The steps we will take in this notebook are:
|
| 13 |
+
#
|
| 14 |
# 0. Setup and configuration
|
| 15 |
# 1. Download the data
|
| 16 |
# 2. Prepare the data
|
|
|
|
| 18 |
# 4. Deploy the Vespa application
|
| 19 |
# 5. Create the Vespa application
|
| 20 |
# 6. Feed the data to the Vespa application
|
| 21 |
+
#
|
| 22 |
# All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
|
| 23 |
# We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
|
| 24 |
+
#
|
| 25 |
|
| 26 |
# %% [markdown]
|
| 27 |
# ## 0. Setup and Configuration
|
| 28 |
+
#
|
| 29 |
|
| 30 |
# %%
|
| 31 |
import os
|
|
|
|
| 83 |
|
| 84 |
# %% [markdown]
|
| 85 |
# ### Create a free trial in Vespa Cloud
|
| 86 |
+
#
|
| 87 |
# Create a tenant from [here](https://vespa.ai/free-trial/).
|
| 88 |
# The trial includes $300 credit.
|
| 89 |
# Take note of your tenant name.
|
| 90 |
+
#
|
| 91 |
|
| 92 |
# %%
|
| 93 |
VESPA_TENANT_NAME = "vespa-team"
|
|
|
|
| 95 |
# %% [markdown]
|
| 96 |
# Here, set your desired application name. (Will be created in later steps)
|
| 97 |
# Note that the application name cannot contain a hyphen `-` or an underscore `_`.
|
| 98 |
+
#
|
| 99 |
|
| 100 |
# %%
|
| 101 |
+
VESPA_APPLICATION_NAME = "colpalidemo"
|
| 102 |
VESPA_SCHEMA_NAME = "pdf_page"
|
| 103 |
|
| 104 |
# %% [markdown]
|
| 105 |
# Next, you need to create some tokens for feeding data, and querying the application.
|
| 106 |
# We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
|
| 107 |
# The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
|
| 108 |
+
#
|
| 109 |
|
| 110 |
# %%
|
| 111 |
VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
|
|
|
|
| 113 |
|
| 114 |
# %% [markdown]
|
| 115 |
# We also need to set the value of the write token to be able to feed data to the Vespa application.
|
| 116 |
+
#
|
| 117 |
|
| 118 |
# %%
|
| 119 |
VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
|
|
|
| 124 |
# We will also use the Gemini API to create sample queries for our images.
|
| 125 |
# You can also use other VLMs to create these queries.
|
| 126 |
# Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
|
| 127 |
+
#
|
| 128 |
|
| 129 |
# %%
|
| 130 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
|
|
|
|
| 152 |
|
| 153 |
# %% [markdown]
|
| 154 |
# ## 1. Download PDFs
|
| 155 |
+
#
|
| 156 |
# We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
|
| 157 |
# The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
|
| 158 |
+
#
|
| 159 |
# These reports are the ones we are going to use for this showcase.
|
| 160 |
# Here are some sample images:
|
| 161 |
+
#
|
| 162 |
# 
|
| 163 |
# 
|
| 164 |
+
#
|
| 165 |
|
| 166 |
# %% [markdown]
|
| 167 |
# As we can see, a lot of the information is in the form of tables, charts and numbers.
|
| 168 |
# These are not easily extractable using pdf-readers or OCR tools.
|
| 169 |
+
#
|
| 170 |
|
| 171 |
# %%
|
| 172 |
import requests
|
|
|
|
| 180 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 181 |
|
| 182 |
links = []
|
| 183 |
+
url_to_year = {}
|
| 184 |
|
| 185 |
+
# Find all 'div's with id starting with 'year-'
|
| 186 |
+
for year_div in soup.find_all("div", id=lambda x: x and x.startswith("year-")):
|
| 187 |
+
year_id = year_div.get("id", "")
|
| 188 |
+
year = year_id.replace("year-", "")
|
| 189 |
+
|
| 190 |
+
# Within this div, find all 'a' elements with the specific classes
|
| 191 |
+
for a_tag in year_div.select("a.button.button--download-secondary[href]"):
|
| 192 |
href = a_tag["href"]
|
| 193 |
full_url = urljoin(url, href)
|
| 194 |
links.append(full_url)
|
| 195 |
+
url_to_year[full_url] = year
|
| 196 |
+
links, url_to_year
|
| 197 |
|
| 198 |
# %%
|
| 199 |
# Limit the number of PDFs to download
|
|
|
|
| 278 |
|
| 279 |
# %% [markdown]
|
| 280 |
# ## 2. Convert PDFs to Images
|
| 281 |
+
#
|
| 282 |
+
|
| 283 |
|
| 284 |
# %%
|
| 285 |
def get_pdf_images(pdf_path):
|
|
|
|
| 305 |
pdf_pages.append(
|
| 306 |
{
|
| 307 |
"title": title,
|
| 308 |
+
"year": int(url_to_year[pdf["url"]]),
|
| 309 |
"url": pdf["url"],
|
| 310 |
"path": pdf_file,
|
| 311 |
"image": image,
|
|
|
|
| 330 |
|
| 331 |
# %% [markdown]
|
| 332 |
# ## 3. Generate Queries
|
| 333 |
+
#
|
| 334 |
# In this step, we want to generate queries for each page image.
|
| 335 |
# These will be useful for 2 reasons:
|
| 336 |
+
#
|
| 337 |
# 1. We can use these queries as typeahead suggestions in the search bar.
|
| 338 |
# 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
|
| 339 |
+
#
|
| 340 |
# The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
|
| 341 |
+
#
|
| 342 |
# We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
|
| 343 |
+
#
|
| 344 |
|
| 345 |
# %%
|
| 346 |
from pydantic import BaseModel
|
|
|
|
| 419 |
}
|
| 420 |
return queries
|
| 421 |
|
| 422 |
+
|
| 423 |
# %%
|
| 424 |
for pdf in tqdm(pdf_pages):
|
| 425 |
image = pdf.get("image")
|
|
|
|
| 495 |
|
| 496 |
# %% [markdown]
|
| 497 |
# ## 4. Generate embeddings
|
| 498 |
+
#
|
| 499 |
# Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
|
| 500 |
+
#
|
| 501 |
+
|
| 502 |
|
| 503 |
# %%
|
| 504 |
def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
|
|
|
| 538 |
all_embeddings = np.concatenate(embeddings_list, axis=0)
|
| 539 |
return all_embeddings
|
| 540 |
|
| 541 |
+
|
| 542 |
# %%
|
| 543 |
# Generate embeddings for all images
|
| 544 |
images = [pdf["image"] for pdf in pdf_pages]
|
|
|
|
| 549 |
|
| 550 |
# %% [markdown]
|
| 551 |
# ## 5. Prepare Data on Vespa Format
|
| 552 |
+
#
|
| 553 |
# Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
|
| 554 |
+
#
|
| 555 |
+
|
| 556 |
|
| 557 |
# %%
|
| 558 |
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
|
|
|
| 565 |
binary_query_embeddings[k] = binary_vector
|
| 566 |
return binary_query_embeddings
|
| 567 |
|
| 568 |
+
|
| 569 |
# %%
|
| 570 |
vespa_feed = []
|
| 571 |
for pdf, embedding in zip(pdf_pages, embeddings):
|
| 572 |
url = pdf["url"]
|
| 573 |
+
year = pdf["year"]
|
| 574 |
title = pdf["title"]
|
| 575 |
image = pdf["image"]
|
| 576 |
text = pdf.get("text", "")
|
|
|
|
| 592 |
"id": id_hash,
|
| 593 |
"url": url,
|
| 594 |
"title": title,
|
| 595 |
+
"year": year,
|
| 596 |
"page_number": page_no,
|
| 597 |
"blur_image": base_64_image,
|
| 598 |
"full_image": base_64_full_image,
|
|
|
|
| 629 |
|
| 630 |
# %% [markdown]
|
| 631 |
# ## 5. Prepare Vespa Application
|
| 632 |
+
#
|
| 633 |
|
| 634 |
# %%
|
| 635 |
# Define the Vespa schema
|
|
|
|
| 644 |
match=["word"],
|
| 645 |
),
|
| 646 |
Field(name="url", type="string", indexing=["summary", "index"]),
|
| 647 |
+
Field(name="year", type="int", indexing=["summary", "attribute"]),
|
| 648 |
Field(
|
| 649 |
name="title",
|
| 650 |
type="string",
|
|
|
|
| 734 |
DocumentSummary(
|
| 735 |
name="suggestions",
|
| 736 |
summary_fields=[
|
| 737 |
+
Summary(name="questions"),
|
|
|
|
|
|
|
| 738 |
],
|
| 739 |
from_disk=True,
|
| 740 |
),
|
|
|
|
| 768 |
# Define the 'bm25' rank profile
|
| 769 |
colpali_bm25_profile = RankProfile(
|
| 770 |
name="bm25",
|
| 771 |
+
inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
|
| 772 |
first_phase="bm25(title) + bm25(text)",
|
| 773 |
functions=mapfunctions,
|
| 774 |
)
|
| 775 |
|
| 776 |
+
|
| 777 |
# A function to create an inherited rank profile which also returns quantized similarity scores
|
| 778 |
def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
| 779 |
return RankProfile(
|
|
|
|
| 783 |
summary_features=["quantized"],
|
| 784 |
)
|
| 785 |
|
| 786 |
+
|
| 787 |
colpali_schema.add_rank_profile(colpali_bm25_profile)
|
| 788 |
colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
|
| 789 |
|
|
|
|
| 955 |
|
| 956 |
# %% [markdown]
|
| 957 |
# ## 6. Deploy Vespa Application
|
| 958 |
+
#
|
| 959 |
|
| 960 |
# %%
|
| 961 |
VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
|
|
|
|
| 980 |
# %% [markdown]
|
| 981 |
# Make sure to take note of the token endpoint_url.
|
| 982 |
# You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
|
| 983 |
+
#
|
| 984 |
|
| 985 |
# %% [markdown]
|
| 986 |
# ## 8. Feed Data to Vespa
|
| 987 |
+
#
|
| 988 |
|
| 989 |
# %%
|
| 990 |
# Instantiate Vespa connection using token
|
| 991 |
app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
|
| 992 |
app.get_application_status()
|
| 993 |
|
| 994 |
+
|
| 995 |
# %%
|
| 996 |
def callback(response: VespaResponse, id: str):
|
| 997 |
if not response.is_successful():
|
|
|
|
| 1002 |
|
| 1003 |
# Feed data into Vespa asynchronously
|
| 1004 |
app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -24,8 +24,15 @@ attrs==24.2.0
|
|
| 24 |
# via aiohttp
|
| 25 |
beautifulsoup4==4.12.3
|
| 26 |
# via python-fasthtml
|
|
|
|
|
|
|
| 27 |
cachetools==5.5.0
|
| 28 |
# via google-auth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
certifi==2024.8.30
|
| 30 |
# via
|
| 31 |
# httpcore
|
|
@@ -39,16 +46,27 @@ click==8.1.7
|
|
| 39 |
# via
|
| 40 |
# typer
|
| 41 |
# uvicorn
|
|
|
|
|
|
|
| 42 |
colpali-engine==0.3.1
|
| 43 |
# via
|
| 44 |
# visual-retrieval-colpali (pyproject.toml)
|
| 45 |
# vidore-benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
contourpy==1.3.0
|
| 47 |
# via matplotlib
|
| 48 |
cryptography==43.0.1
|
| 49 |
# via pyvespa
|
| 50 |
cycler==0.12.1
|
| 51 |
# via matplotlib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
datasets==2.21.0
|
| 53 |
# via
|
| 54 |
# mteb
|
|
@@ -168,11 +186,16 @@ itsdangerous==2.2.0
|
|
| 168 |
jinja2==3.1.4
|
| 169 |
# via
|
| 170 |
# pyvespa
|
|
|
|
| 171 |
# torch
|
| 172 |
joblib==1.4.2
|
| 173 |
# via scikit-learn
|
| 174 |
kiwisolver==1.4.7
|
| 175 |
# via matplotlib
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
loguru==0.7.2
|
| 177 |
# via vidore-benchmark
|
| 178 |
lucide-fasthtml==0.0.9
|
|
@@ -181,6 +204,8 @@ lxml==5.3.0
|
|
| 181 |
# via
|
| 182 |
# lucide-fasthtml
|
| 183 |
# pyvespa
|
|
|
|
|
|
|
| 184 |
markdown-it-py==3.0.0
|
| 185 |
# via rich
|
| 186 |
markupsafe==2.1.5
|
|
@@ -201,11 +226,17 @@ multidict==6.1.0
|
|
| 201 |
# yarl
|
| 202 |
multiprocess==0.70.16
|
| 203 |
# via datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
networkx==3.3
|
| 205 |
# via torch
|
| 206 |
numpy==1.26.4
|
| 207 |
# via
|
| 208 |
# accelerate
|
|
|
|
| 209 |
# colpali-engine
|
| 210 |
# contourpy
|
| 211 |
# datasets
|
|
@@ -217,6 +248,8 @@ numpy==1.26.4
|
|
| 217 |
# scikit-learn
|
| 218 |
# scipy
|
| 219 |
# seaborn
|
|
|
|
|
|
|
| 220 |
# transformers
|
| 221 |
# vidore-benchmark
|
| 222 |
oauthlib==3.2.2
|
|
@@ -229,7 +262,10 @@ packaging==24.1
|
|
| 229 |
# huggingface-hub
|
| 230 |
# matplotlib
|
| 231 |
# peft
|
|
|
|
|
|
|
| 232 |
# transformers
|
|
|
|
| 233 |
pandas==2.2.3
|
| 234 |
# via
|
| 235 |
# datasets
|
|
@@ -247,8 +283,14 @@ pillow==10.4.0
|
|
| 247 |
# pdf2image
|
| 248 |
# sentence-transformers
|
| 249 |
# vidore-benchmark
|
|
|
|
|
|
|
| 250 |
polars==1.9.0
|
| 251 |
# via mteb
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
proto-plus==1.24.0
|
| 253 |
# via
|
| 254 |
# google-ai-generativelanguage
|
|
@@ -277,8 +319,12 @@ pycparser==2.22
|
|
| 277 |
# via cffi
|
| 278 |
pydantic==2.9.2
|
| 279 |
# via
|
|
|
|
| 280 |
# google-generativeai
|
| 281 |
# mteb
|
|
|
|
|
|
|
|
|
|
| 282 |
pydantic-core==2.23.4
|
| 283 |
# via pydantic
|
| 284 |
pygments==2.18.0
|
|
@@ -334,7 +380,9 @@ requests==2.32.3
|
|
| 334 |
# mteb
|
| 335 |
# pyvespa
|
| 336 |
# requests-toolbelt
|
|
|
|
| 337 |
# transformers
|
|
|
|
| 338 |
requests-toolbelt==1.0.0
|
| 339 |
# via pyvespa
|
| 340 |
rich==13.9.2
|
|
@@ -366,27 +414,47 @@ sentence-transformers==3.1.1
|
|
| 366 |
sentencepiece==0.2.0
|
| 367 |
# via vidore-benchmark
|
| 368 |
setuptools==75.1.0
|
| 369 |
-
# via
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
shad4fast==1.2.1
|
| 371 |
# via visual-retrieval-colpali (pyproject.toml)
|
| 372 |
shellingham==1.5.4
|
| 373 |
# via typer
|
| 374 |
six==1.16.0
|
| 375 |
# via python-dateutil
|
|
|
|
|
|
|
| 376 |
sniffio==1.3.1
|
| 377 |
# via
|
| 378 |
# anyio
|
| 379 |
# httpx
|
| 380 |
soupsieve==2.6
|
| 381 |
# via beautifulsoup4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
sqlite-minutils==3.37.0.post3
|
| 383 |
# via fastlite
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
starlette==0.39.2
|
| 385 |
# via python-fasthtml
|
| 386 |
sympy==1.13.3
|
| 387 |
# via torch
|
| 388 |
tenacity==9.0.0
|
| 389 |
# via pyvespa
|
|
|
|
|
|
|
| 390 |
threadpoolctl==3.5.0
|
| 391 |
# via scikit-learn
|
| 392 |
tokenizers==0.20.0
|
|
@@ -408,6 +476,7 @@ tqdm==4.66.5
|
|
| 408 |
# mteb
|
| 409 |
# peft
|
| 410 |
# sentence-transformers
|
|
|
|
| 411 |
# transformers
|
| 412 |
transformers==4.45.1
|
| 413 |
# via
|
|
@@ -416,10 +485,14 @@ transformers==4.45.1
|
|
| 416 |
# sentence-transformers
|
| 417 |
# vidore-benchmark
|
| 418 |
typer==0.12.5
|
| 419 |
-
# via
|
|
|
|
|
|
|
|
|
|
| 420 |
typing-extensions==4.12.2
|
| 421 |
# via
|
| 422 |
# anyio
|
|
|
|
| 423 |
# google-generativeai
|
| 424 |
# huggingface-hub
|
| 425 |
# mteb
|
|
@@ -448,10 +521,19 @@ vespacli==8.391.23
|
|
| 448 |
# via visual-retrieval-colpali (pyproject.toml)
|
| 449 |
vidore-benchmark==4.0.0
|
| 450 |
# via visual-retrieval-colpali (pyproject.toml)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
watchfiles==0.24.0
|
| 452 |
# via uvicorn
|
|
|
|
|
|
|
| 453 |
websockets==13.1
|
| 454 |
# via uvicorn
|
|
|
|
|
|
|
| 455 |
xxhash==3.5.0
|
| 456 |
# via datasets
|
| 457 |
yarl==1.13.1
|
|
|
|
| 24 |
# via aiohttp
|
| 25 |
beautifulsoup4==4.12.3
|
| 26 |
# via python-fasthtml
|
| 27 |
+
blis==0.7.11
|
| 28 |
+
# via thinc
|
| 29 |
cachetools==5.5.0
|
| 30 |
# via google-auth
|
| 31 |
+
catalogue==2.0.10
|
| 32 |
+
# via
|
| 33 |
+
# spacy
|
| 34 |
+
# srsly
|
| 35 |
+
# thinc
|
| 36 |
certifi==2024.8.30
|
| 37 |
# via
|
| 38 |
# httpcore
|
|
|
|
| 46 |
# via
|
| 47 |
# typer
|
| 48 |
# uvicorn
|
| 49 |
+
cloudpathlib==0.20.0
|
| 50 |
+
# via weasel
|
| 51 |
colpali-engine==0.3.1
|
| 52 |
# via
|
| 53 |
# visual-retrieval-colpali (pyproject.toml)
|
| 54 |
# vidore-benchmark
|
| 55 |
+
confection==0.1.5
|
| 56 |
+
# via
|
| 57 |
+
# thinc
|
| 58 |
+
# weasel
|
| 59 |
contourpy==1.3.0
|
| 60 |
# via matplotlib
|
| 61 |
cryptography==43.0.1
|
| 62 |
# via pyvespa
|
| 63 |
cycler==0.12.1
|
| 64 |
# via matplotlib
|
| 65 |
+
cymem==2.0.8
|
| 66 |
+
# via
|
| 67 |
+
# preshed
|
| 68 |
+
# spacy
|
| 69 |
+
# thinc
|
| 70 |
datasets==2.21.0
|
| 71 |
# via
|
| 72 |
# mteb
|
|
|
|
| 186 |
jinja2==3.1.4
|
| 187 |
# via
|
| 188 |
# pyvespa
|
| 189 |
+
# spacy
|
| 190 |
# torch
|
| 191 |
joblib==1.4.2
|
| 192 |
# via scikit-learn
|
| 193 |
kiwisolver==1.4.7
|
| 194 |
# via matplotlib
|
| 195 |
+
langcodes==3.4.1
|
| 196 |
+
# via spacy
|
| 197 |
+
language-data==1.2.0
|
| 198 |
+
# via langcodes
|
| 199 |
loguru==0.7.2
|
| 200 |
# via vidore-benchmark
|
| 201 |
lucide-fasthtml==0.0.9
|
|
|
|
| 204 |
# via
|
| 205 |
# lucide-fasthtml
|
| 206 |
# pyvespa
|
| 207 |
+
marisa-trie==1.2.1
|
| 208 |
+
# via language-data
|
| 209 |
markdown-it-py==3.0.0
|
| 210 |
# via rich
|
| 211 |
markupsafe==2.1.5
|
|
|
|
| 226 |
# yarl
|
| 227 |
multiprocess==0.70.16
|
| 228 |
# via datasets
|
| 229 |
+
murmurhash==1.0.10
|
| 230 |
+
# via
|
| 231 |
+
# preshed
|
| 232 |
+
# spacy
|
| 233 |
+
# thinc
|
| 234 |
networkx==3.3
|
| 235 |
# via torch
|
| 236 |
numpy==1.26.4
|
| 237 |
# via
|
| 238 |
# accelerate
|
| 239 |
+
# blis
|
| 240 |
# colpali-engine
|
| 241 |
# contourpy
|
| 242 |
# datasets
|
|
|
|
| 248 |
# scikit-learn
|
| 249 |
# scipy
|
| 250 |
# seaborn
|
| 251 |
+
# spacy
|
| 252 |
+
# thinc
|
| 253 |
# transformers
|
| 254 |
# vidore-benchmark
|
| 255 |
oauthlib==3.2.2
|
|
|
|
| 262 |
# huggingface-hub
|
| 263 |
# matplotlib
|
| 264 |
# peft
|
| 265 |
+
# spacy
|
| 266 |
+
# thinc
|
| 267 |
# transformers
|
| 268 |
+
# weasel
|
| 269 |
pandas==2.2.3
|
| 270 |
# via
|
| 271 |
# datasets
|
|
|
|
| 283 |
# pdf2image
|
| 284 |
# sentence-transformers
|
| 285 |
# vidore-benchmark
|
| 286 |
+
pip==24.3.1
|
| 287 |
+
# via visual-retrieval-colpali (pyproject.toml)
|
| 288 |
polars==1.9.0
|
| 289 |
# via mteb
|
| 290 |
+
preshed==3.0.9
|
| 291 |
+
# via
|
| 292 |
+
# spacy
|
| 293 |
+
# thinc
|
| 294 |
proto-plus==1.24.0
|
| 295 |
# via
|
| 296 |
# google-ai-generativelanguage
|
|
|
|
| 319 |
# via cffi
|
| 320 |
pydantic==2.9.2
|
| 321 |
# via
|
| 322 |
+
# confection
|
| 323 |
# google-generativeai
|
| 324 |
# mteb
|
| 325 |
+
# spacy
|
| 326 |
+
# thinc
|
| 327 |
+
# weasel
|
| 328 |
pydantic-core==2.23.4
|
| 329 |
# via pydantic
|
| 330 |
pygments==2.18.0
|
|
|
|
| 380 |
# mteb
|
| 381 |
# pyvespa
|
| 382 |
# requests-toolbelt
|
| 383 |
+
# spacy
|
| 384 |
# transformers
|
| 385 |
+
# weasel
|
| 386 |
requests-toolbelt==1.0.0
|
| 387 |
# via pyvespa
|
| 388 |
rich==13.9.2
|
|
|
|
| 414 |
sentencepiece==0.2.0
|
| 415 |
# via vidore-benchmark
|
| 416 |
setuptools==75.1.0
|
| 417 |
+
# via
|
| 418 |
+
# visual-retrieval-colpali (pyproject.toml)
|
| 419 |
+
# marisa-trie
|
| 420 |
+
# spacy
|
| 421 |
+
# thinc
|
| 422 |
shad4fast==1.2.1
|
| 423 |
# via visual-retrieval-colpali (pyproject.toml)
|
| 424 |
shellingham==1.5.4
|
| 425 |
# via typer
|
| 426 |
six==1.16.0
|
| 427 |
# via python-dateutil
|
| 428 |
+
smart-open==7.0.5
|
| 429 |
+
# via weasel
|
| 430 |
sniffio==1.3.1
|
| 431 |
# via
|
| 432 |
# anyio
|
| 433 |
# httpx
|
| 434 |
soupsieve==2.6
|
| 435 |
# via beautifulsoup4
|
| 436 |
+
spacy==3.7.5
|
| 437 |
+
# via visual-retrieval-colpali (pyproject.toml)
|
| 438 |
+
spacy-legacy==3.0.12
|
| 439 |
+
# via spacy
|
| 440 |
+
spacy-loggers==1.0.5
|
| 441 |
+
# via spacy
|
| 442 |
sqlite-minutils==3.37.0.post3
|
| 443 |
# via fastlite
|
| 444 |
+
srsly==2.4.8
|
| 445 |
+
# via
|
| 446 |
+
# confection
|
| 447 |
+
# spacy
|
| 448 |
+
# thinc
|
| 449 |
+
# weasel
|
| 450 |
starlette==0.39.2
|
| 451 |
# via python-fasthtml
|
| 452 |
sympy==1.13.3
|
| 453 |
# via torch
|
| 454 |
tenacity==9.0.0
|
| 455 |
# via pyvespa
|
| 456 |
+
thinc==8.2.5
|
| 457 |
+
# via spacy
|
| 458 |
threadpoolctl==3.5.0
|
| 459 |
# via scikit-learn
|
| 460 |
tokenizers==0.20.0
|
|
|
|
| 476 |
# mteb
|
| 477 |
# peft
|
| 478 |
# sentence-transformers
|
| 479 |
+
# spacy
|
| 480 |
# transformers
|
| 481 |
transformers==4.45.1
|
| 482 |
# via
|
|
|
|
| 485 |
# sentence-transformers
|
| 486 |
# vidore-benchmark
|
| 487 |
typer==0.12.5
|
| 488 |
+
# via
|
| 489 |
+
# spacy
|
| 490 |
+
# vidore-benchmark
|
| 491 |
+
# weasel
|
| 492 |
typing-extensions==4.12.2
|
| 493 |
# via
|
| 494 |
# anyio
|
| 495 |
+
# cloudpathlib
|
| 496 |
# google-generativeai
|
| 497 |
# huggingface-hub
|
| 498 |
# mteb
|
|
|
|
| 521 |
# via visual-retrieval-colpali (pyproject.toml)
|
| 522 |
vidore-benchmark==4.0.0
|
| 523 |
# via visual-retrieval-colpali (pyproject.toml)
|
| 524 |
+
wasabi==1.1.3
|
| 525 |
+
# via
|
| 526 |
+
# spacy
|
| 527 |
+
# thinc
|
| 528 |
+
# weasel
|
| 529 |
watchfiles==0.24.0
|
| 530 |
# via uvicorn
|
| 531 |
+
weasel==0.4.1
|
| 532 |
+
# via spacy
|
| 533 |
websockets==13.1
|
| 534 |
# via uvicorn
|
| 535 |
+
wrapt==1.16.0
|
| 536 |
+
# via smart-open
|
| 537 |
xxhash==3.5.0
|
| 538 |
# via datasets
|
| 539 |
yarl==1.13.1
|
vespa_feed_to_hf_dataset.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
import base64
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import io
|
| 7 |
+
from datasets import Dataset, Image as HFImage
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
df = pd.read_json("output/vespa_feed_full.jsonl", lines=True)
|
| 14 |
+
df = pd.json_normalize(df["fields"].tolist())
|
| 15 |
+
|
| 16 |
+
dataset_dir = Path("hf_dataset")
|
| 17 |
+
image_dir = dataset_dir / "images"
|
| 18 |
+
os.makedirs(image_dir, exist_ok=True)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def save_image(image_data, filename):
|
| 22 |
+
img_data = base64.b64decode(image_data)
|
| 23 |
+
img = Image.open(io.BytesIO(img_data))
|
| 24 |
+
img.save(filename)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
for idx, row in tqdm(df.iterrows()):
|
| 28 |
+
blur_filename = os.path.join(image_dir, f"blur_{idx}.jpg")
|
| 29 |
+
full_filename = os.path.join(image_dir, f"full_{idx}.jpg")
|
| 30 |
+
save_image(row["blur_image"], blur_filename)
|
| 31 |
+
save_image(row["full_image"], full_filename)
|
| 32 |
+
df.at[idx, "blur_image"] = blur_filename
|
| 33 |
+
df.at[idx, "full_image"] = full_filename
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Step 3: Convert to Hugging Face Dataset
|
| 37 |
+
dataset = (
|
| 38 |
+
Dataset.from_dict(df.to_dict(orient="list"))
|
| 39 |
+
.cast_column("blur_image", HFImage())
|
| 40 |
+
.cast_column("full_image", HFImage())
|
| 41 |
+
)
|
| 42 |
+
dataset.push_to_hub("vespa-engine/gpfg-QA", private=True)
|