From 56e977ffb5cae8775905dda66b19f9809175b522 Mon Sep 17 00:00:00 2001
From: Robert Helewka
Date: Mon, 4 May 2026 12:39:54 -0400
Subject: [PATCH] fix(library): normalize MIME types to file extensions in Daedalus ingest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Daedalus may send `file_type` as a MIME type (e.g. `text/markdown`) rather
than a bare extension. Add a `_normalize_file_type` helper with a MIME→ext
lookup table and sensible fallbacks so ingested items are stored with
proper extensions like `md` instead of `text/markdown`.

MIME parameters (e.g. `; charset=utf-8`) are dropped before the lookup so
that `text/markdown; charset=utf-8` also resolves to `md`.

---
 mnemosyne/library/tasks.py | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/mnemosyne/library/tasks.py b/mnemosyne/library/tasks.py
index ddd6b05..1290211 100644
--- a/mnemosyne/library/tasks.py
+++ b/mnemosyne/library/tasks.py
@@ -17,6 +17,49 @@ logger = logging.getLogger(__name__)
 # Cache key pattern for task progress
 PROGRESS_KEY = "library:task:{task_id}:progress"
 
+# MIME type → file extension, for when Daedalus sends content_type as file_type
+_MIME_TO_EXT = {
+    "text/markdown": "md",
+    "text/plain": "txt",
+    "text/html": "html",
+    "text/csv": "csv",
+    "text/xml": "xml",
+    "application/pdf": "pdf",
+    "application/epub+zip": "epub",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+    "application/json": "json",
+    "image/jpeg": "jpg",
+    "image/png": "png",
+    "image/gif": "gif",
+    "image/webp": "webp",
+    "image/tiff": "tiff",
+}
+
+
+def _normalize_file_type(raw: str) -> str:
+    """Convert a MIME type or extension string to a bare extension.
+
+    Accepts a bare extension (``md``, ``.PDF``), a MIME type
+    (``text/markdown``), or a MIME type with parameters
+    (``text/plain; charset=utf-8``). ``None`` or empty input yields ``bin``.
+    """
+    # Drop MIME parameters ("; charset=utf-8") so they can't defeat the lookup
+    raw = (raw or "").split(";", 1)[0].strip().lower()
+    if "/" in raw:
+        # It's a MIME type — look up or derive from the subtype
+        ext = _MIME_TO_EXT.get(raw)
+        if ext:
+            return ext
+        # Fallback: take the subtype, preferring its "+suffix" if present
+        subtype = raw.split("/", 1)[1]
+        subtype = subtype.split("+")[-1]  # e.g. "svg+xml" → "xml"
+        if subtype.startswith("x-"):
+            subtype = subtype[2:]  # "x-tar" → "tar"; "vnd.*" is kept as-is
+        return subtype.lstrip(".") or "bin"
+    return raw.lstrip(".") or "bin"
+
 
 def _update_progress(task, percent: int, message: str):
     """
@@ -373,7 +416,7 @@ def ingest_from_daedalus(self, job_id: str):
         data = fetch_from_daedalus(job.s3_key)
 
         # --- 4. Create Item node ---
-        ext = (job.file_type or "bin").lstrip(".").lower() or "bin"
+        ext = _normalize_file_type(job.file_type)
         item = Item(
             title=job.title,
             file_type=ext,