File size: 2,818 Bytes
7a10db2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
import re
from typing import Dict, List, Optional, Tuple
import PyPDF2
import docx2txt
from PIL import Image
import pytesseract
import io
class ResumeScanner:
    """Simple resume text extractor - no complex analysis needed for vector search.

    Turns the raw bytes of an uploaded resume (PDF, DOC/DOCX, TXT or a common
    image format) into plain text and offers a small normalization helper.
    Every extraction method is best-effort: a failure is reported with
    ``print`` and an empty string is returned instead of raising.
    """

    def __init__(self):
        """Create a scanner; the instance holds no state."""
        pass

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from various file formats.

        The format is chosen from the text after the last ``.`` in
        ``filename`` (case-insensitive).

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is inspected.

        Returns:
            The extracted text, or ``""`` when the format is unsupported or
            extraction fails.
        """
        try:
            # Computed inside the ``try`` so that a malformed ``filename``
            # (e.g. ``None``) follows the same "report and return ''" path
            # as every other failure instead of escaping as an exception.
            file_ext = filename.lower().split('.')[-1]

            if file_ext == 'pdf':
                return self._extract_from_pdf(file_content)
            # NOTE(review): legacy ``.doc`` files are routed to the DOCX
            # reader as well -- confirm docx2txt can actually parse them.
            if file_ext in ('doc', 'docx'):
                return self._extract_from_docx(file_content)
            if file_ext == 'txt':
                # Undecodable bytes become U+FFFD rather than discarding the
                # entire document because of one bad byte.
                return file_content.decode('utf-8', errors='replace')
            if file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
                return self._extract_from_image(file_content)
            raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            print(f"β Error extracting text from {filename}: {e}")
            return ""

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Extract text from PDF file.

        Args:
            file_content: Raw bytes of the PDF document.

        Returns:
            The text of every page, each followed by a newline, or ``""``
            if the document cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ""`` guards against a page yielding None (nothing
            # extractable), which would otherwise raise TypeError on the
            # concatenation and throw away the text of all other pages.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"β Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Extract text from DOCX file.

        Args:
            file_content: Raw bytes of the document.

        Returns:
            Whatever ``docx2txt.process`` extracts, or ``""`` on failure.
        """
        try:
            return docx2txt.process(io.BytesIO(file_content))
        except Exception as e:
            print(f"β Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Extract text from image using OCR.

        Args:
            file_content: Raw bytes of the image file.

        Returns:
            The text recognized by ``pytesseract``, or ``""`` on failure.
        """
        try:
            # The context manager releases the decoded image once OCR is done.
            with Image.open(io.BytesIO(file_content)) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"β Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Clean and optimize extracted text for better vector search.

        Every character that is not a word character, whitespace or one of
        ``. , @ -`` is replaced by a space; afterwards each whitespace run
        (newlines included) is collapsed to a single space and the result is
        trimmed.

        Note that ``+`` and ``#`` are among the removed characters, so
        ``"C++"`` is reduced to ``"C"``.

        Args:
            text: Raw extracted text; may be empty or ``None``.

        Returns:
            The normalized single-line text, or ``""`` for falsy input.
        """
        if not text:
            return ""

        # Replace special characters FIRST: each replacement inserts a space,
        # so collapsing whitespace beforehand would leave double spaces.
        text = re.sub(r'[^\w\s.,@-]', ' ', text)
        # ``\s+`` also matches newlines, so one pass flattens everything.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()
# Global instance: a module-level scanner created once at import time.
resume_scanner = ResumeScanner()