diff --git a/services/document_service.py b/services/document_service.py index 0bff6b7..6f6a865 100644 --- a/services/document_service.py +++ b/services/document_service.py @@ -165,15 +165,42 @@ class DocumentService: print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误 + try: + row_count = len(table.rows) + except Exception: + row_count = 0 + + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -187,7 +214,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 扫描表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") continue @@ -447,16 +486,37 @@ class DocumentService: print(f"[WARN] 无法获取表格 {table_idx} 的行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误 + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - # 尝试获取cells,如果失败则跳过该行 - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -472,7 +532,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 无法访问段落,跳过: {str(e)}") continue @@ -532,22 +604,43 @@ class DocumentService: if not table.rows: continue - # 安全地获取表格行数 + # 安全地获取表格行数,使用索引方式访问行,而不是迭代器 try: row_count = len(table.rows) except Exception as e: - print(f"[WARN] 验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") + print(f"[WARN] 保存前验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -561,7 +654,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 验证表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") continue @@ -613,28 +718,81 @@ class DocumentService: if field_code: verification_placeholders.add(field_code) - for table in doc.tables: + for table_idx, table in enumerate(doc.tables): try: if not table.rows: continue - for row in table.rows: + + # 安全地获取表格行数,使用索引方式访问行 + try: + row_count = len(table.rows) + except Exception: + continue + + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + if not hasattr(row, 'cells'): continue + try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + continue + + if not cells: + continue except (IndexError, AttributeError): continue - for cell in cells: + + for cell_idx, cell in enumerate(cells): try: - if hasattr(cell, 'paragraphs'): - for paragraph in cell.paragraphs: + if not hasattr(cell, 'paragraphs'): + continue + + # 安全地获取paragraphs列表 + try: + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue + except (IndexError, AttributeError): + continue + + for paragraph in paragraphs: + try: text = paragraph.text matches = placeholder_pattern.findall(text) for match in matches: field_code = match.strip() if field_code: verification_placeholders.add(field_code) + except Exception: + continue except Exception: continue except Exception: @@ -688,13 +846,34 @@ class DocumentService: except Exception: continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器 + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + if not hasattr(row, 'cells'): continue try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + continue + if not cells: continue except (IndexError, AttributeError): @@ -707,7 +886,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError): continue