File size: 2,818 Bytes
7a10db2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import re
from typing import Dict, List, Optional, Tuple
import PyPDF2
import docx2txt
from PIL import Image
import pytesseract
import io

class ResumeScanner:
    """Simple resume text extractor - no complex analysis needed for vector search.

    Turns resume files (PDF, DOC/DOCX, plain text, common image formats) into
    plain text and offers a helper that normalises that text. Extraction is
    best-effort: failures are printed and reported as an empty string instead
    of being raised.
    """

    # Anything that is not a word character, whitespace or one of . , @ -
    _SPECIAL_CHARS_RE = re.compile(r'[^\w\s.,@-]')
    # Any run of whitespace (spaces, tabs, newlines, ...).
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        """Create a scanner; the instance holds no state."""

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from various file formats.

        The format is chosen from the lower-cased text after the last dot in
        ``filename``.

        Args:
            file_content: Raw bytes of the file.
            filename: Name used to pick the format and to label error output.

        Returns:
            The extracted text, or "" when the format is unsupported or the
            extraction fails (the error is printed, never raised).
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return self._extract_from_pdf(file_content)
            elif file_ext in ('doc', 'docx'):
                return self._extract_from_docx(file_content)
            elif file_ext == 'txt':
                # 'utf-8-sig' drops a leading BOM, and errors='replace' keeps
                # the readable part of files that are not valid UTF-8 instead
                # of discarding the whole document.
                return file_content.decode('utf-8-sig', errors='replace')
            elif file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
                return self._extract_from_image(file_content)
            else:
                raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            # Deliberate best-effort boundary: report and fall back to "".
            print(f"❌ Error extracting text from {filename}: {e}")
            return ""

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Extract text from PDF file.

        Args:
            file_content: Raw bytes of the PDF.

        Returns:
            The text of every page, each followed by a newline, or "" if the
            PDF cannot be read (the error is printed).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # `or ''` guards against a page yielding None, which would
            # otherwise raise TypeError and throw away every other page.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"❌ Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Extract text from DOCX file.

        NOTE(review): 'doc' files are routed here as well; legacy binary .doc
        files are presumably not readable by docx2txt and would end up as ""
        - confirm and add a dedicated reader if they must be supported.

        Args:
            file_content: Raw bytes of the document.

        Returns:
            The document text, or "" if it cannot be read (the error is
            printed).
        """
        try:
            return docx2txt.process(io.BytesIO(file_content))
        except Exception as e:
            print(f"❌ Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Extract text from image using OCR.

        Args:
            file_content: Raw bytes of the image.

        Returns:
            The text recognised by pytesseract, or "" if the image cannot be
            opened or processed (the error is printed).
        """
        try:
            # The context manager releases the image once OCR has finished.
            with Image.open(io.BytesIO(file_content)) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"❌ Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Clean and optimize extracted text for better vector search.

        Characters other than word characters, whitespace and ``. , @ -`` are
        replaced by spaces, then every run of whitespace (including newlines)
        is collapsed to a single space and the result is trimmed.

        Args:
            text: Raw extracted text; may be empty or None.

        Returns:
            The cleaned single-line text, or "" for empty input.
        """
        if not text:
            return ""

        # Replace special characters first: each replacement inserts a space,
        # so collapsing whitespace must happen afterwards or the output would
        # contain runs of spaces again.
        text = self._SPECIAL_CHARS_RE.sub(' ', text)
        text = self._WHITESPACE_RE.sub(' ', text)

        return text.strip()

# Global instance: a module-level ResumeScanner created once when this module
# is imported.
resume_scanner = ResumeScanner()