chore: update something
Browse files
- docsifer/service.py +9 -2
docsifer/service.py
CHANGED
|
@@ -111,18 +111,25 @@ class DocsiferService:
|
|
| 111 |
# Use a temp directory so MarkItDown sees the real file extension
|
| 112 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 113 |
mime_type = magic.from_file(str(src), mime=True)
|
|
|
|
| 114 |
if not mime_type:
|
| 115 |
logger.warning(f"Could not detect file type for: {src}")
|
| 116 |
new_filename = src.name
|
| 117 |
else:
|
| 118 |
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
| 119 |
-
guessed_ext = mimetypes.guess_extension(mime_type) or ""
|
| 120 |
new_filename = f"{src.stem}{guessed_ext}"
|
| 121 |
tmp_path = Path(tmpdir) / new_filename
|
| 122 |
tmp_path.write_bytes(src.read_bytes())
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# If it's HTML and cleanup is requested
|
| 125 |
-
if cleanup and
|
| 126 |
self._maybe_cleanup_html(tmp_path)
|
| 127 |
|
| 128 |
# Decide whether to use LLM or basic
|
|
|
|
| 111 |
# Use a temp directory so MarkItDown sees the real file extension
|
| 112 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 113 |
mime_type = magic.from_file(str(src), mime=True)
|
| 114 |
+
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
| 115 |
if not mime_type:
|
| 116 |
logger.warning(f"Could not detect file type for: {src}")
|
| 117 |
new_filename = src.name
|
| 118 |
else:
|
| 119 |
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
|
|
|
| 120 |
new_filename = f"{src.stem}{guessed_ext}"
|
| 121 |
tmp_path = Path(tmpdir) / new_filename
|
| 122 |
tmp_path.write_bytes(src.read_bytes())
|
| 123 |
|
| 124 |
+
logger.info(
|
| 125 |
+
"Using temp file: %s, MIME type: %s, Guessed ext: %s",
|
| 126 |
+
tmp_path,
|
| 127 |
+
mime_type,
|
| 128 |
+
guessed_ext,
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
# If it's HTML and cleanup is requested
|
| 132 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
| 133 |
self._maybe_cleanup_html(tmp_path)
|
| 134 |
|
| 135 |
# Decide whether to use LLM or basic
|