Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Release History

**Fixes**

- None
- Fixes `#70 <https://github.com/dfop02/html4docx/issues/70>`_: malformed table overflow. | `dfop02 <https://github.com/dfop02>`_

**New Features**

Expand Down
47 changes: 39 additions & 8 deletions html4docx/h4d.py
Original file line number Diff line number Diff line change
Expand Up @@ -1281,9 +1281,14 @@ def handle_table(self, current_attrs):
for cell_row, row in enumerate(self.get_table_rows(table_soup)):
col_offset = 0 # Shift index if some columns are occupied
for col in self.get_table_columns(row):
while used_cells[cell_row][col_offset]:
while col_offset < cols and used_cells[cell_row][col_offset]:
col_offset += 1

if col_offset >= cols:
raise ValueError(
f"Table layout mismatch: exceeded column count ({cols}) at row {cell_row}"
)

current_row = cell_row
current_col = col_offset

Expand Down Expand Up @@ -1782,20 +1787,46 @@ def get_table_dimensions(self, table_soup):

default_span = 1
max_cols = 0
max_rows = len(rows)

# Track occupied cells caused by rowspan
used_cells = []

for row_idx, row in enumerate(rows):
cols = self.get_table_columns(row)
# Handle colspan
row_col_count = sum(utils.safe_int(col.get('colspan', default_span)) for col in cols)
max_cols = max(max_cols, row_col_count)

# Handle rowspan
# Ensure used_cells has current row
while len(used_cells) <= row_idx:
used_cells.append([])

col_offset = 0

for col in cols:
# Expand row if needed
while len(used_cells[row_idx]) <= col_offset:
used_cells[row_idx].append(False)

# Skip already occupied cells
while col_offset < len(used_cells[row_idx]) and used_cells[row_idx][col_offset]:
col_offset += 1

rowspan = utils.safe_int(col.get('rowspan', default_span))
if rowspan > default_span:
max_rows = max(max_rows, row_idx + rowspan)
colspan = utils.safe_int(col.get('colspan', default_span))

# Mark occupied cells
for r in range(row_idx, row_idx + rowspan):
while len(used_cells) <= r:
used_cells.append([])

for c in range(col_offset, col_offset + colspan):
while len(used_cells[r]) <= c:
used_cells[r].append(False)
used_cells[r][c] = True

col_offset += colspan

max_cols = max(max_cols, len(used_cells[row_idx]))

max_rows = len(used_cells)
return max_rows, max_cols

def get_tables(self) -> None:
Expand Down
50 changes: 50 additions & 0 deletions tests/test_h4d.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,6 +2055,56 @@ def test_extreme_colspan_rowspan_cases(self):
except Exception as e:
self.fail(f"Processing extreme table failed with unexpected error: {e}")

def test_malformed_table_overflow(self):
    """Test a table whose rowspan pushes a later row past the declared column count.

    The first row declares two cells, the first of which spans two rows.
    The second row still declares two cells of its own, so they have to be
    placed after the column occupied by the rowspan. The test expects the
    resulting table to have 3 columns and 2 rows, with the second-row cells
    shifted one column to the right, and no IndexError while building it.
    """
    self.document.add_heading('Test: Malformed Table Overflow', level=1)

    # The literal is kept flush-left so its content stays exactly as written.
    malformed_html = """
<table>
<tr>
<td rowspan="2">spans down</td>
<td>B1</td>
</tr>
<tr>
<td>A2</td>
<td>B2</td>
</tr>
</table>
"""

    try:
        self.parser.table_style = 'Table Grid'
        self.parser.add_html_to_document(malformed_html, self.document)
        document = self.parser.parse_html_string(malformed_html)

        tables = document.tables
        assert len(tables) == 1, "Should create exactly one table"

        table = tables[0]

        assert len(table.columns) == 3, (
            f"Expected 3 columns due to rowspan shift, but got {len(table.columns)}"
        )

        assert len(table.rows) == 2, (
            f"Expected 2 rows, but got {len(table.rows)}"
        )

        # Validate content placement
        assert "spans down" in table.cell(0, 0).text, "Rowspan cell not in correct position"
        assert "B1" in table.cell(0, 1).text, "B1 should be in row 0, col 1"

        # Second row:
        # col 0 is occupied by rowspan
        # so A2 → col 1, B2 → col 2
        assert "A2" in table.cell(1, 1).text, "A2 should be shifted to column 1"
        assert "B2" in table.cell(1, 2).text, "B2 should be in column 2"

    except AssertionError:
        # AssertionError is a subclass of Exception; without this clause the
        # catch-all below would relabel a failed expectation as an
        # "unexpected error". Let it propagate with its own message.
        raise
    except IndexError as e:
        self.fail(f"Malformed table caused IndexError (regression): {e}")
    except Exception as e:
        self.fail(f"Malformed table failed with unexpected error: {e}")

def test_nested_styles_on_multiple_tags(self):
""" Test nested styles on multiple tags """
self.document.add_heading('Test: Test nested styles on multiple tags', level=1)
Expand Down
Loading