File size: 2,624 Bytes
0106d0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
Vision tools: image captioning using Hugging Face Inference API with a local fallback.

Functions:
- `caption_image(path)`: returns a short caption for an image file.
"""
from typing import Optional
import os
import logging

logger = logging.getLogger(__name__)

# Optional dependencies: each import is individually guarded so the module
# still loads (with reduced functionality) when a package is missing.

# Remote captioning via the Hugging Face Inference API.
try:
    from huggingface_hub import InferenceApi
except Exception:
    InferenceApi = None

# Local captioning via a transformers pipeline.
try:
    from transformers import pipeline
except Exception:
    pipeline = None

# Decorator used to expose `caption_image` as an agent tool.
try:
    from smolagents import tool
except Exception:
    tool = None


def caption_image(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
    """Caption an image at `path` using HF Inference API when possible, otherwise local pipeline if available.

    Returns a short textual caption or an error string.
    """
    if not os.path.exists(path):
        return "(file not found)"

    hf_token = os.environ.get("HF_TOKEN")
    # Try Inference API first
    if hf_token and InferenceApi is not None:
        try:
            client = InferenceApi(repo_id=model, token=hf_token)
            with open(path, "rb") as f:
                out = client(inputs=f)
            # InferenceApi for image-to-text may return text or structure
            if isinstance(out, dict) and "generated_text" in out:
                return out["generated_text"].strip()
            if isinstance(out, list) and len(out) > 0:
                first = out[0]
                if isinstance(first, dict) and "generated_text" in first:
                    return first["generated_text"].strip()
                return str(first)
            return str(out)
        except Exception as e:
            logger.warning("HF Inference image captioning failed: %s", e)

    # Local pipeline fallback (may not be installed or suitable for large models)
    if pipeline is not None:
        try:
            pipe = pipeline("image-to-text", model=model)
            res = pipe(path)
            if isinstance(res, list) and len(res) > 0:
                return res[0].get("generated_text", str(res[0]))
            return str(res)
        except Exception as e:
            logger.warning("Local pipeline image captioning failed: %s", e)

    return "(image captioning unavailable)"


# Expose `caption_image` as a smolagents tool when the decorator is
# available; otherwise (or if wrapping fails) export the plain function
# under the same name so callers always get a callable.
if tool is None:
    caption_image_tool = caption_image
else:
    try:
        @tool
        def caption_image_tool(path: str, model: str = "nlpconnect/vit-gpt2-image-captioning") -> str:
            return caption_image(path, model=model)
    except Exception:
        caption_image_tool = caption_image

__all__ = ["caption_image", "caption_image_tool"]