Unstructured-IO · longway-code · Feb 8, 2026
diff --git a/example-docs/emoji.xlsm b/example-docs/emoji.xlsm
diff --git a/example-docs/empty.xlsm b/example-docs/empty.xlsm
diff --git a/example-docs/multi-sheet-test.xlsm b/example-docs/multi-sheet-test.xlsm
diff --git a/example-docs/stanley-cups.xlsm b/example-docs/stanley-cups.xlsm
diff --git a/example-docs/xlsx-subtable-cases.xlsm b/example-docs/xlsx-subtable-cases.xlsm
diff --git a/test_unstructured/partition/test_xlsm.py b/test_unstructured/partition/test_xlsm.py
@@ -0,0 +1,222 @@
+# pyright: reportPrivateUsage=false
+
+"""Test-suite for the `unstructured.partition.xlsx` module (XLSM support)."""
+
+from __future__ import annotations
+
+import io
+import tempfile
+
+import pytest
+
+from test_unstructured.partition.test_constants import (
+    EXPECTED_TABLE_XLSX,
+    EXPECTED_TEXT_XLSX,
+    EXPECTED_TITLE,
+)
+from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from unstructured.cleaners.core import clean_extra_whitespace
+from unstructured.documents.elements import Table, Text, Title
+from unstructured.partition.xlsx import partition_xlsm
+
+# -- XLSM has a different MIME type than XLSX --
+EXPECTED_FILETYPE_XLSM = "application/vnd.ms-excel.sheet.macroEnabled.12"
+EXPECTED_PAGE_NAME = "Stanley Cups"
+
+
+# ------------------------------------------------------------------------------------------------
+# INTEGRATION TESTS
+# ------------------------------------------------------------------------------------------------
+# These test `partition_xlsm()` as a whole by calling `partition_xlsm()` and inspecting the
+# outputs. XLSM files have the same structure as XLSX files but with macro support.
+# ------------------------------------------------------------------------------------------------
+
+
+def test_partition_xlsm_from_filename():
+    """Test that partition_xlsm can process an XLSM file given its (CWD-independent) path."""
+    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=False)
+
+    assert sum(isinstance(element, Table) for element in elements) == 2
+    assert len(elements) == 4
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
+    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
+    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
+    assert elements[1].metadata.page_number == 1
+    assert elements[1].metadata.filetype == EXPECTED_FILETYPE_XLSM
+    assert elements[1].metadata.page_name == EXPECTED_PAGE_NAME
+    assert elements[1].metadata.filename == "stanley-cups.xlsm"
+
+
+def test_partition_xlsm_from_file():
+    """Test that partition_xlsm can process an XLSM file from file object."""
+    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
+        elements = partition_xlsm(file=f, include_header=False)
+
+    assert sum(isinstance(element, Table) for element in elements) == 2
+    assert len(elements) == 4
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
+    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
+    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
+    assert elements[1].metadata.page_number == 1
+    assert elements[1].metadata.filetype == EXPECTED_FILETYPE_XLSM
+    assert elements[1].metadata.page_name == EXPECTED_PAGE_NAME
+    assert elements[1].metadata.filename is None
+
+
+def test_partition_xlsm_from_file_with_metadata_filename():
+    """Test that metadata_filename parameter works correctly for XLSM files."""
+    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
+        elements = partition_xlsm(
+            file=f, metadata_filename="custom-name.xlsm", include_header=False
+        )
+
+    assert elements[0].metadata.filename == "custom-name.xlsm"
+
+
+def test_partition_xlsm_from_file_like_object_with_name():
+    """Test that partition_xlsm works with file-like objects that have a name attribute."""
+    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
+        file = io.BytesIO(f.read())
+    file.name = "stanley-cups-downloaded.xlsm"
+
+    elements = partition_xlsm(file=file, include_header=False)
+
+    assert sum(isinstance(element, Table) for element in elements) == 2
+    assert len(elements) == 4
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
+    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
+    assert elements[1].metadata.page_number == 1
+    assert elements[1].metadata.filetype == EXPECTED_FILETYPE_XLSM
+
+
+def test_partition_xlsm_from_SpooledTemporaryFile_with_emoji():
+    """Test that partition_xlsm handles emoji characters correctly in XLSM files."""
+    with tempfile.SpooledTemporaryFile() as f:
+        with open(example_doc_path("emoji.xlsm"), "rb") as g:
+            f.write(g.read())
+
+        elements = partition_xlsm(file=f, include_header=False)
+
+    assert sum(isinstance(element, Text) for element in elements) == 1
+    assert len(elements) == 1
+    assert clean_extra_whitespace(elements[0].text) == "🤠😅"
+
+
+def test_partition_xlsm_with_multiple_sheets():
+    """Test that partition_xlsm correctly processes XLSM files with multiple worksheets."""
+    elements = partition_xlsm(example_doc_path("multi-sheet-test.xlsm"), include_header=False)
+
+    # -- Should have elements from both sheets --
+    assert len(elements) > 0
+
+    # -- Check that we have tables from multiple sheets --
+    page_names = {e.metadata.page_name for e in elements if hasattr(e.metadata, "page_name")}
+    assert len(page_names) >= 2  # -- At least 2 different sheet names --
+
+
+def test_partition_xlsm_with_subtables():
+    """Test that partition_xlsm correctly detects subtables in XLSM files."""
+    elements = partition_xlsm(example_doc_path("xlsx-subtable-cases.xlsm"), find_subtable=True)
+
+    # -- With subtable detection, we should get separate elements for subtables --
+    assert len(elements) > 1
+
+
+def test_partition_xlsm_without_subtables():
+    """Test that partition_xlsm treats entire sheet as one table when find_subtable=False."""
+    elements = partition_xlsm(example_doc_path("xlsx-subtable-cases.xlsm"), find_subtable=False)
+
+    # -- Without subtable detection, entire sheet is one table --
+    assert len(elements) == 1
+    assert isinstance(elements[0], Table)
+
+
+@pytest.mark.parametrize("infer_table_structure", [True, False])
+def test_partition_xlsm_infer_table_structure(infer_table_structure: bool):
+    """Test that infer_table_structure parameter controls HTML table generation."""
+    elements = partition_xlsm(
+        example_doc_path("stanley-cups.xlsm"), infer_table_structure=infer_table_structure
+    )
+    table_elements = [e for e in elements if isinstance(e, Table)]
+    for table_element in table_elements:
+        table_element_has_text_as_html_field = (
+            hasattr(table_element.metadata, "text_as_html")
+            and table_element.metadata.text_as_html is not None
+        )
+        assert table_element_has_text_as_html_field == infer_table_structure
+
+
+def test_partition_xlsm_with_header():
+    """Test that partition_xlsm includes header when include_header=True."""
+    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=True)
+
+    assert len(elements) == 2
+    assert all(isinstance(e, Table) for e in elements)
+    e = elements[0]
+    # -- Header row is included in the text --
+    assert e.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    assert e.metadata.text_as_html is not None
+
+
+def test_partition_xlsm_from_empty_file():
+    """Test that partition_xlsm handles empty XLSM files gracefully."""
+    elements = partition_xlsm(example_doc_path("empty.xlsm"))
+
+    # -- Only checks that partitioning completes and returns a list; contents are not pinned --
+    assert isinstance(elements, list)
+
+
+def test_partition_xlsm_metadata_page_numbers():
+    """Test that page numbers are correctly assigned to elements from multiple sheets."""
+    elements = partition_xlsm(example_doc_path("multi-sheet-test.xlsm"), starting_page_number=5)
+
+    # -- Page numbers should start from the specified starting_page_number --
+    page_numbers = [e.metadata.page_number for e in elements if hasattr(e.metadata, "page_number")]
+    assert min(page_numbers) >= 5
+
+
+def test_partition_xlsm_raises_on_no_file_or_path():
+    """Calling partition_xlsm() with neither `filename` nor `file` raises ValueError."""
+    with pytest.raises(ValueError, match="Either 'filename' or 'file' argument must be specif"):
+        partition_xlsm()
+
+
+def test_partition_xlsm_serializable_to_json():
+    """Test that elements from partition_xlsm can be serialized to JSON."""
+    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=False)
+
+    # -- Elements should be serializable to JSON --
+    assert_round_trips_through_JSON(elements)
+
+
+# ------------------------------------------------------------------------------------------------
+# FILE TYPE DETECTION TESTS
+# ------------------------------------------------------------------------------------------------
+# Test that XLSM files are correctly detected by the auto-detection system
+# ------------------------------------------------------------------------------------------------
+
+
+def test_auto_partition_xlsm_from_filename():
+    """Test that partition() auto-detects XLSM files and uses partition_xlsm."""
+    from unstructured.partition.auto import partition
+
+    elements = partition(example_doc_path("stanley-cups.xlsm"), include_header=False)
+
+    # -- Should successfully partition the file --
+    assert len(elements) > 0
+    # -- Should use XLSM MIME type --
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE_XLSM
+
+
+def test_auto_partition_xlsm_from_file():
+    """Test that partition() auto-detects XLSM files from file objects."""
+    from unstructured.partition.auto import partition
+
+    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
+        elements = partition(file=f, metadata_filename="test.xlsm", include_header=False)
+
+    # -- Should successfully partition the file --
+    assert len(elements) > 0
+    # -- Should use XLSM MIME type --
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE_XLSM
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -731,8 +731,8 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
     def _file_type(self) -> FileType | None:
         """Differentiated file-type for a Zip archive.
 
-        Returns `FileType.DOCX`, `FileType.PPTX`, or `FileType.XLSX` when one of those applies,
-        `None` otherwise.
+        Returns `FileType.DOCX`, `FileType.PPTX`, `FileType.XLSX`, or `FileType.XLSM` when one of
+        those applies, `None` otherwise.
         """
         if not self._ctx.is_zipfile:
             return None
@@ -746,6 +746,10 @@ def _file_type(self) -> FileType | None:
                 return FileType.DOCX
 
             if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
+                # -- Both XLSX and XLSM have the same internal structure (xl/workbook.xml).
+                # -- Distinguish them by file extension since XLSM is macro-enabled XLSX.
+                if self._ctx.extension == ".xlsm":
+                    return FileType.XLSM
                 return FileType.XLSX
 
             if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):

diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py
@@ -472,6 +472,15 @@ def partitioner_shortname(self) -> str | None:
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         cast(list[str], []),
     )
+    XLSM = (
+        "xlsm",
+        "xlsx",  # -- uses the same partitioner as XLSX --
+        ["pandas", "openpyxl"],
+        "xlsx",
+        [".xlsm"],
+        "application/vnd.ms-excel.sheet.macroEnabled.12",
+        cast(list[str], []),
+    )
     XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
     ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))
 

diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
@@ -461,3 +461,56 @@ def _get_metadata(
         filename=opts.metadata_file_path,
         last_modified=opts.last_modified,
     )
+
+
+@apply_metadata(FileType.XLSM)
+@add_chunking_strategy
+def partition_xlsm(
+    filename: Optional[str] = None,
+    *,
+    file: Optional[IO[bytes]] = None,
+    find_subtable: bool = True,
+    include_header: bool = False,
+    infer_table_structure: bool = True,
+    starting_page_number: int = 1,
+    **kwargs: Any,
+) -> list[Element]:
+    """Partitions Microsoft Excel Macro-Enabled Documents in .xlsm format into document elements.
+
+    XLSM files are Excel workbooks that can contain VBA macros. They have the same structure as
+    XLSX files (Office Open XML format) but with a different MIME type. This function extracts
+    only the data content; macros are not processed.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    find_subtable
+        Detect "subtables" on each worksheet and partition each of those as a separate `Table`
+        element. When `False`, each worksheet is partitioned as a single `Table` element. A
+        subtable is a contiguous block of cells with more than two cells in each row.
+    include_header
+        Determines whether or not header info is included in text and metadata.text_as_html
+    infer_table_structure
+        When `True`, each extracted `Table` element also gets a `metadata.text_as_html` field
+        containing the table rendered as an HTML string, preserving rows and cells. The `text`
+        field is always present either way and holds the table's text content (no structure).
+    starting_page_number
+        The starting page number to assign to the first worksheet. Subsequent sheets are
+        numbered sequentially.
+    **kwargs
+        Additional keyword arguments, forwarded unchanged to `partition_xlsx()`.
+    """
+    # -- XLSM shares the XLSX data structure, so partitioning is delegated to `partition_xlsx()`.
+    # -- NOTE(review): confirm forwarded `**kwargs` (e.g. chunking) are not applied twice.
+    return partition_xlsx(
+        filename=filename,
+        file=file,
+        find_subtable=find_subtable,
+        include_header=include_header,
+        infer_table_structure=infer_table_structure,
+        starting_page_number=starting_page_number,
+        **kwargs,
+    )