Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added example-docs/emoji.xlsm
Binary file not shown.
Binary file added example-docs/empty.xlsm
Binary file not shown.
Binary file added example-docs/multi-sheet-test.xlsm
Binary file not shown.
Binary file added example-docs/stanley-cups.xlsm
Binary file not shown.
Binary file added example-docs/xlsx-subtable-cases.xlsm
Binary file not shown.
222 changes: 222 additions & 0 deletions test_unstructured/partition/test_xlsm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# pyright: reportPrivateUsage=false

"""Test-suite for the `unstructured.partition.xlsx` module (XLSM support)."""

from __future__ import annotations

import io
import tempfile

import pytest

from test_unstructured.partition.test_constants import (
EXPECTED_TABLE_XLSX,
EXPECTED_TEXT_XLSX,
EXPECTED_TITLE,
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table, Text, Title
from unstructured.partition.xlsx import partition_xlsm

# -- XLSM has a different MIME type than XLSX --
EXPECTED_FILETYPE_XLSM = "application/vnd.ms-excel.sheet.macroEnabled.12"
EXPECTED_PAGE_NAME = "Stanley Cups"


# ------------------------------------------------------------------------------------------------
# INTEGRATION TESTS
# ------------------------------------------------------------------------------------------------
# These test `partition_xlsm()` as a whole by calling `partition_xlsm()` and inspecting the
# outputs. XLSM files have the same structure as XLSX files but with macro support.
# ------------------------------------------------------------------------------------------------


def test_partition_xlsm_from_filename():
    """`partition_xlsm()` partitions an XLSM workbook identified by its file path.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=False)

    # -- the workbook yields four elements, two of which are tables --
    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 4

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE

    # -- the second element is the first table; check its text and metadata --
    table = elements[1]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT_XLSX
    assert table.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert table.metadata.page_number == 1
    assert table.metadata.filetype == EXPECTED_FILETYPE_XLSM
    assert table.metadata.page_name == EXPECTED_PAGE_NAME
    # -- only the base name is expected in `.filename`, not the directory part --
    assert table.metadata.filename == "stanley-cups.xlsm"


def test_partition_xlsm_from_file():
    """`partition_xlsm()` partitions an XLSM workbook supplied as an open binary file.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
        elements = partition_xlsm(file=f, include_header=False)

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 4
    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE

    table = elements[1]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT_XLSX
    assert table.metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert table.metadata.page_number == 1
    assert table.metadata.filetype == EXPECTED_FILETYPE_XLSM
    assert table.metadata.page_name == EXPECTED_PAGE_NAME
    # -- no filename metadata is expected when only a file object is provided --
    assert table.metadata.filename is None


def test_partition_xlsm_from_file_with_metadata_filename():
    """A `metadata_filename` argument is reported as the element's filename metadata.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
        elements = partition_xlsm(
            file=f, metadata_filename="custom-name.xlsm", include_header=False
        )

    assert elements[0].metadata.filename == "custom-name.xlsm"


def test_partition_xlsm_from_file_like_object_with_name():
    """`partition_xlsm()` accepts an in-memory file-like object carrying a `.name` attribute.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
        file = io.BytesIO(f.read())
    # -- mimic a downloaded file: a bytes buffer that advertises a name --
    file.name = "stanley-cups-downloaded.xlsm"

    elements = partition_xlsm(file=file, include_header=False)

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 4
    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
    assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
    assert elements[1].metadata.page_number == 1
    assert elements[1].metadata.filetype == EXPECTED_FILETYPE_XLSM


def test_partition_xlsm_from_SpooledTemporaryFile_with_emoji():
    """`partition_xlsm()` reads a `SpooledTemporaryFile` and preserves emoji cell text.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    with tempfile.SpooledTemporaryFile() as f:
        with open(example_doc_path("emoji.xlsm"), "rb") as g:
            f.write(g.read())

        # -- partition while the spooled file is still open; leaving the outer `with`
        # -- closes it. The stream is deliberately not rewound here.
        elements = partition_xlsm(file=f, include_header=False)

    assert sum(isinstance(element, Text) for element in elements) == 1
    assert len(elements) == 1
    assert clean_extra_whitespace(elements[0].text) == "🤠😅"


def test_partition_xlsm_with_multiple_sheets():
    """Elements from a multi-sheet XLSM workbook carry at least two distinct sheet names.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("multi-sheet-test.xlsm"), include_header=False)

    # -- Should have elements from both sheets --
    assert len(elements) > 0

    # -- Collect real sheet names only. A bare `hasattr()` check would let a `None`
    # -- page name through and count it as a "sheet".
    page_names = {getattr(e.metadata, "page_name", None) for e in elements}
    page_names.discard(None)
    assert len(page_names) >= 2  # -- At least 2 different sheet names --


def test_partition_xlsm_with_subtables():
    """With `find_subtable=True` the subtable workbook produces more than one element.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("xlsx-subtable-cases.xlsm"), find_subtable=True)

    # -- With subtable detection, we should get separate elements for subtables --
    assert len(elements) > 1


def test_partition_xlsm_without_subtables():
    """With `find_subtable=False` the subtable workbook produces exactly one `Table`.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("xlsx-subtable-cases.xlsm"), find_subtable=False)

    # -- Without subtable detection, entire sheet is one table --
    assert len(elements) == 1
    assert isinstance(elements[0], Table)


@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_xlsm_infer_table_structure(infer_table_structure: bool):
    """`text_as_html` is populated on tables exactly when `infer_table_structure` is True.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(
        example_doc_path("stanley-cups.xlsm"), infer_table_structure=infer_table_structure
    )

    table_elements = [e for e in elements if isinstance(e, Table)]
    # -- without this guard the loop below would pass vacuously if no table was produced --
    assert table_elements

    for table_element in table_elements:
        has_text_as_html = getattr(table_element.metadata, "text_as_html", None) is not None
        assert has_text_as_html == infer_table_structure


def test_partition_xlsm_with_header():
    """With `include_header=True` the header row is part of the table text.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=True)

    assert len(elements) == 2
    assert all(isinstance(e, Table) for e in elements)

    first_table = elements[0]
    # -- Header row is included in the text --
    assert first_table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
    assert first_table.metadata.text_as_html is not None


def test_partition_xlsm_from_empty_file():
    """Partitioning an empty XLSM workbook returns a list without raising.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("empty.xlsm"))

    # -- Empty file should return empty list or minimal elements --
    assert isinstance(elements, list)


def test_partition_xlsm_metadata_page_numbers():
    """No element is numbered below the requested `starting_page_number`.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("multi-sheet-test.xlsm"), starting_page_number=5)

    # -- Keep only real page numbers. A bare `hasattr()` check would let `None` through
    # -- and make the `min()` comparison below fail with a TypeError.
    candidates = [getattr(e.metadata, "page_number", None) for e in elements]
    page_numbers = [number for number in candidates if number is not None]

    # -- fail with a clear assertion rather than `min()` raising on an empty sequence --
    assert page_numbers
    # -- Page numbers should start from the specified starting_page_number --
    assert min(page_numbers) >= 5


def test_partition_xlsm_raises_on_no_file_or_path():
    """Calling `partition_xlsm()` with neither `filename` nor `file` raises `ValueError`."""
    # -- `match` is a regex searched in the message; a prefix of the text is sufficient --
    expected_message = "Either 'filename' or 'file' argument must be specif"

    with pytest.raises(ValueError, match=expected_message):
        partition_xlsm()


def test_partition_xlsm_serializable_to_json():
    """Elements produced by `partition_xlsm()` survive a JSON round trip.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    elements = partition_xlsm(example_doc_path("stanley-cups.xlsm"), include_header=False)

    # -- Elements should be serializable to JSON --
    assert_round_trips_through_JSON(elements)


# ------------------------------------------------------------------------------------------------
# FILE TYPE DETECTION TESTS
# ------------------------------------------------------------------------------------------------
# Test that XLSM files are correctly detected by the auto-detection system
# ------------------------------------------------------------------------------------------------


def test_auto_partition_xlsm_from_filename():
    """`partition()` handles an XLSM path and reports the XLSM MIME type.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    # -- imported locally, as in the original test --
    from unstructured.partition.auto import partition

    elements = partition(example_doc_path("stanley-cups.xlsm"), include_header=False)

    # -- Should successfully partition the file --
    assert len(elements) > 0
    # -- Should use XLSM MIME type --
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE_XLSM


def test_auto_partition_xlsm_from_file():
    """`partition()` handles an XLSM file object accompanied by an `.xlsm` metadata filename.

    The path is resolved with `example_doc_path()` so the test does not depend on the
    directory pytest happens to be launched from.
    """
    # -- imported locally, as in the original test --
    from unstructured.partition.auto import partition

    with open(example_doc_path("stanley-cups.xlsm"), "rb") as f:
        # -- the file object is paired with an `.xlsm` name via `metadata_filename` --
        elements = partition(file=f, metadata_filename="test.xlsm", include_header=False)

    # -- Should successfully partition the file --
    assert len(elements) > 0
    # -- Should use XLSM MIME type --
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE_XLSM
8 changes: 6 additions & 2 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,8 +731,8 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
def _file_type(self) -> FileType | None:
"""Differentiated file-type for a Zip archive.

Returns `FileType.DOCX`, `FileType.PPTX`, or `FileType.XLSX` when one of those applies,
`None` otherwise.
Returns `FileType.DOCX`, `FileType.PPTX`, `FileType.XLSX`, or `FileType.XLSM` when one of
those applies, `None` otherwise.
"""
if not self._ctx.is_zipfile:
return None
Expand All @@ -746,6 +746,10 @@ def _file_type(self) -> FileType | None:
return FileType.DOCX

if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
# -- Both XLSX and XLSM have the same internal structure (xl/workbook.xml).
# -- Distinguish them by file extension since XLSM is macro-enabled XLSX.
if self._ctx.extension == ".xlsm":
return FileType.XLSM
return FileType.XLSX

if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
Expand Down
9 changes: 9 additions & 0 deletions unstructured/file_utils/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,15 @@ def partitioner_shortname(self) -> str | None:
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
cast(list[str], []),
)
# -- Macro-enabled Excel workbook (.xlsm). It reuses the "xlsx" partitioner and differs from
# -- the XLSX member in extension and MIME type.
# -- NOTE(review): the per-field notes below are inferred from the neighbouring members
# -- (e.g. XML); confirm them against the enum's constructor, which is not visible here.
XLSM = (
"xlsm",
"xlsx", # -- uses the same partitioner as XLSX --
["pandas", "openpyxl"],  # -- presumably the packages the partitioner needs; verify --
"xlsx",
[".xlsm"],  # -- presumably the file extensions mapped to this type; verify --
"application/vnd.ms-excel.sheet.macroEnabled.12",  # -- MIME type for macro-enabled workbooks --
cast(list[str], []),
)
XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))

Expand Down
53 changes: 53 additions & 0 deletions unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,3 +461,56 @@ def _get_metadata(
filename=opts.metadata_file_path,
last_modified=opts.last_modified,
)


@apply_metadata(FileType.XLSM)
@add_chunking_strategy
def partition_xlsm(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    find_subtable: bool = True,
    include_header: bool = False,
    infer_table_structure: bool = True,
    starting_page_number: int = 1,
    **kwargs: Any,
) -> list[Element]:
    """Partition a macro-enabled Excel workbook (.xlsm) into document elements.

    An XLSM workbook has the same Office Open XML structure as an XLSX workbook but may also
    contain VBA macros and uses a different MIME type. Only the worksheet data is extracted;
    macros are not processed. All partitioning work is delegated to `partition_xlsx()`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    find_subtable
        Detect "subtables" on each worksheet and partition each of those as a separate `Table`
        element. When `False`, each worksheet is partitioned as a single `Table` element. A
        subtable is a contiguous block of cells with more than two cells in each row.
    include_header
        Determines whether or not header info is included in text and metadata.text_as_html.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    starting_page_number
        The starting page number to assign to the first worksheet. Subsequent sheets are
        numbered sequentially.
    **kwargs
        Any additional keyword arguments, forwarded unchanged to `partition_xlsx()`.
    """
    # -- The data layout is the same as XLSX, so the XLSX partitioner does the actual work.
    # -- NOTE(review): if `partition_xlsx()` carries its own chunking/metadata decorators,
    # -- confirm a `chunking_strategy` forwarded via `kwargs` is not applied twice.
    elements = partition_xlsx(
        filename=filename,
        file=file,
        find_subtable=find_subtable,
        include_header=include_header,
        infer_table_structure=infer_table_structure,
        starting_page_number=starting_page_number,
        **kwargs,
    )
    return elements