diff --git a/HISTORY.rst b/HISTORY.rst index f595038..cbb1d97 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -12,7 +12,7 @@ Release History **Fixes** -- None +- Fixes `#70 `_: malformed table overflow. | `dfop02 `_ **New Features** diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 85b992a..407fa9b 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -1281,9 +1281,14 @@ def handle_table(self, current_attrs): for cell_row, row in enumerate(self.get_table_rows(table_soup)): col_offset = 0 # Shift index if some columns are occupied for col in self.get_table_columns(row): - while used_cells[cell_row][col_offset]: + while col_offset < cols and used_cells[cell_row][col_offset]: col_offset += 1 + if col_offset >= cols: + raise ValueError( + f"Table layout mismatch: exceeded column count ({cols}) at row {cell_row}" + ) + current_row = cell_row current_col = col_offset @@ -1782,20 +1787,46 @@ def get_table_dimensions(self, table_soup): default_span = 1 max_cols = 0 - max_rows = len(rows) + + # Track occupied cells caused by rowspan + used_cells = [] for row_idx, row in enumerate(rows): cols = self.get_table_columns(row) - # Handle colspan - row_col_count = sum(utils.safe_int(col.get('colspan', default_span)) for col in cols) - max_cols = max(max_cols, row_col_count) - # Handle rowspan + # Ensure used_cells has current row + while len(used_cells) <= row_idx: + used_cells.append([]) + + col_offset = 0 + for col in cols: + # Expand row if needed + while len(used_cells[row_idx]) <= col_offset: + used_cells[row_idx].append(False) + + # Skip already occupied cells + while col_offset < len(used_cells[row_idx]) and used_cells[row_idx][col_offset]: + col_offset += 1 + rowspan = utils.safe_int(col.get('rowspan', default_span)) - if rowspan > default_span: - max_rows = max(max_rows, row_idx + rowspan) + colspan = utils.safe_int(col.get('colspan', default_span)) + + # Mark occupied cells + for r in range(row_idx, row_idx + rowspan): + while len(used_cells) <= r: + used_cells.append([]) + + for c in range(col_offset, col_offset + colspan): + while len(used_cells[r]) <= c: + used_cells[r].append(False) + used_cells[r][c] = True + + col_offset += colspan + + max_cols = max(max_cols, len(used_cells[row_idx])) + max_rows = len(used_cells) return max_rows, max_cols def get_tables(self) -> None: diff --git a/tests/test_h4d.py b/tests/test_h4d.py index 685df09..6aa4dcb 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -2055,6 +2055,56 @@ def test_extreme_colspan_rowspan_cases(self): except Exception as e: self.fail(f"Processing extreme table failed with unexpected error: {e}") + def test_malformed_table_overflow(self): + """Test table where rowspan or colspan causes column overflow beyond initial dimension calculation""" + self.document.add_heading('Test: Malformed Table Overflow', level=1) + + malformed_html = """ + + + + + + + + + +
spans downB1
A2B2
+ """ + + try: + self.parser.table_style = 'Table Grid' + self.parser.add_html_to_document(malformed_html, self.document) + document = self.parser.parse_html_string(malformed_html) + + tables = document.tables + assert len(tables) == 1, "Should create exactly one table" + + table = tables[0] + + assert len(table.columns) == 3, ( + f"Expected 3 columns due to rowspan shift, but got {len(table.columns)}" + ) + + assert len(table.rows) == 2, ( + f"Expected 2 rows, but got {len(table.rows)}" + ) + + # Validate content placement + assert "spans down" in table.cell(0, 0).text, "Rowspan cell not in correct position" + assert "B1" in table.cell(0, 1).text, "B1 should be in row 0, col 1" + + # Second row: + # col 0 is occupied by rowspan + # so A2 → col 1, B2 → col 2 + assert "A2" in table.cell(1, 1).text, "A2 should be shifted to column 1" + assert "B2" in table.cell(1, 2).text, "B2 should be in column 2" + + except IndexError as e: + self.fail(f"Malformed table caused IndexError (regression): {e}") + except Exception as e: + self.fail(f"Malformed table failed with unexpected error: {e}") + def test_nested_styles_on_multiple_tags(self): """ Test nested styles on multiple tags """ self.document.add_heading('Test: Test nested styles on multiple tags', level=1)