|
|
import os |
|
|
import re |
|
|
from typing import Dict, List, Optional, Tuple |
|
|
import PyPDF2 |
|
|
import docx2txt |
|
|
from PIL import Image |
|
|
import pytesseract |
|
|
import io |
|
|
|
|
|
class ResumeScanner:
    """Extract plain text from resume files for downstream vector search.

    Dispatches on the file extension to a PDF reader (PyPDF2), a DOCX reader
    (docx2txt), a plain-text decoder, or OCR (pytesseract). Every extraction
    path is best-effort: on failure an error line is printed and an empty
    string is returned instead of raising.
    """

    # Anything that is not a word character, whitespace, or one of ``. , @ -``.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,@-]')
    # Any run of whitespace, newlines included.
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self):
        """Create a scanner; it holds no state."""
        pass

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from a file, choosing the extractor by extension.

        Args:
            file_content: Raw bytes of the file.
            filename: File name; only the part after the last ``.``
                (lower-cased) is used to select the extractor.

        Returns:
            The extracted text, or ``""`` when the extension is unsupported
            or extraction fails (the error is printed, not raised).
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return self._extract_from_pdf(file_content)
            # NOTE(review): legacy ``.doc`` files are routed to docx2txt as
            # well; confirm that library can actually read them.
            if file_ext in ('doc', 'docx'):
                return self._extract_from_docx(file_content)
            if file_ext == 'txt':
                # ``utf-8-sig`` drops a leading BOM; ``errors="replace"``
                # keeps the readable part of a file that has a few
                # undecodable bytes instead of discarding the whole file.
                return file_content.decode('utf-8-sig', errors='replace')
            if file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
                return self._extract_from_image(file_content)
            raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            # Deliberate best-effort boundary: report and fall back to "".
            print(f"β Error extracting text from {filename}: {e}")
            return ""

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Return the text of every PDF page, each followed by a newline.

        Returns ``""`` (after printing the error) if the PDF cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ""`` is defensive in case a page yields None rather than
            # a string; join avoids repeated string concatenation.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"β Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Return the text docx2txt extracts from the document bytes.

        Returns ``""`` (after printing the error) if processing fails.
        """
        try:
            return docx2txt.process(io.BytesIO(file_content))
        except Exception as e:
            print(f"β Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Return the text pytesseract recognises in the image bytes.

        Returns ``""`` (after printing the error) if the image cannot be
        opened or OCR fails.
        """
        try:
            # The context manager releases the image once OCR is done.
            with Image.open(io.BytesIO(file_content)) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"β Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Normalise extracted text into a single clean line.

        Characters other than word characters, whitespace and ``. , @ -``
        are replaced by spaces, every whitespace run (including newlines)
        is collapsed to one space, and the result is stripped.

        Args:
            text: Raw extracted text; may be empty or ``None``.

        Returns:
            The cleaned text, or ``""`` for empty input.
        """
        if not text:
            return ""

        # Replace disallowed characters first so that the spaces this
        # introduces are collapsed by the whitespace pass that follows.
        text = self._DISALLOWED_CHARS.sub(' ', text)
        text = self._WHITESPACE_RUN.sub(' ', text)
        return text.strip()
|
|
|
|
|
|
|
|
# Module-level ResumeScanner instance, created once at import time.
resume_scanner = ResumeScanner()