From 4d9080855c878c6d7752e6addeb3f1833a319c46 Mon Sep 17 00:00:00 2001 From: python Date: Thu, 11 Dec 2025 16:34:50 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=96=87=E6=A1=A3=E6=9C=8D?= =?UTF-8?q?=E5=8A=A1=E4=B8=AD=E7=9A=84=E8=A1=A8=E6=A0=BC=E5=A4=84=E7=90=86?= =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BD=BF=E7=94=A8=E7=B4=A2=E5=BC=95?= =?UTF-8?q?=E6=96=B9=E5=BC=8F=E8=AE=BF=E9=97=AE=E8=A1=8C=E5=92=8C=E5=8D=95?= =?UTF-8?q?=E5=85=83=E6=A0=BC=E4=BB=A5=E9=81=BF=E5=85=8D=E8=BF=AD=E4=BB=A3?= =?UTF-8?q?=E6=97=B6=E7=9A=84=E7=B4=A2=E5=BC=95=E9=94=99=E8=AF=AF=EF=BC=8C?= =?UTF-8?q?=E5=90=8C=E6=97=B6=E5=A2=9E=E5=BC=BA=E5=AF=B9=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E6=83=85=E5=86=B5=E7=9A=84=E5=A4=84=E7=90=86=EF=BC=8C=E7=A1=AE?= =?UTF-8?q?=E4=BF=9D=E5=9C=A8=E8=AE=BF=E9=97=AE=E8=A1=8C=E3=80=81=E5=8D=95?= =?UTF-8?q?=E5=85=83=E6=A0=BC=E5=92=8C=E6=AE=B5=E8=90=BD=E6=97=B6=E7=9A=84?= =?UTF-8?q?=E7=A8=B3=E5=AE=9A=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/document_service.py | 233 +++++++++++++++++++++++++++++++---- 1 file changed, 212 insertions(+), 21 deletions(-) diff --git a/services/document_service.py b/services/document_service.py index 0bff6b7..6f6a865 100644 --- a/services/document_service.py +++ b/services/document_service.py @@ -165,15 +165,42 @@ class DocumentService: print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误 + try: + row_count = len(table.rows) + except Exception: + row_count = 0 + + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -187,7 +214,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 扫描表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") continue @@ -447,16 +486,37 @@ class DocumentService: print(f"[WARN] 无法获取表格 {table_idx} 的行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误 + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - # 尝试获取cells,如果失败则跳过该行 - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -472,7 +532,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 无法访问段落,跳过: {str(e)}") continue @@ -532,22 +604,43 @@ class DocumentService: if not table.rows: continue - # 安全地获取表格行数 + # 安全地获取表格行数,使用索引方式访问行,而不是迭代器 try: row_count = len(table.rows) except Exception as e: - print(f"[WARN] 验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") + print(f"[WARN] 保存前验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") continue - for row_idx, row in enumerate(table.rows): + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + # 安全地访问 row.cells,避免 docx 库在处理异常表格结构时的 bug if not hasattr(row, 'cells'): continue # 使用 try-except 包裹,防止 IndexError try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + # 如果某个单元格无法访问,跳过 + continue + if not cells: continue except (IndexError, AttributeError) as e: @@ -561,7 +654,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError) as e: print(f"[WARN] 验证表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") continue @@ -613,28 +718,81 @@ class DocumentService: if field_code: verification_placeholders.add(field_code) - for table in doc.tables: + for table_idx, table in enumerate(doc.tables): try: if not table.rows: continue - for row in table.rows: + + # 安全地获取表格行数,使用索引方式访问行 + try: + row_count = len(table.rows) + except Exception: + continue + + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + if not hasattr(row, 'cells'): continue + try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + continue + + if not cells: + continue except (IndexError, AttributeError): continue - for cell in cells: + + for cell_idx, cell in enumerate(cells): try: - if hasattr(cell, 'paragraphs'): - for paragraph in cell.paragraphs: + if not hasattr(cell, 'paragraphs'): + continue + + # 安全地获取paragraphs列表 + try: + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue + except (IndexError, AttributeError): + continue + + for paragraph in paragraphs: + try: text = paragraph.text matches = placeholder_pattern.findall(text) for match in matches: field_code = match.strip() if field_code: verification_placeholders.add(field_code) + except Exception: + continue except Exception: continue except Exception: @@ -688,13 +846,34 @@ class DocumentService: except Exception: continue - for row_idx, row in enumerate(table.rows): + # 使用索引方式访问行,而不是迭代器 + for row_idx in range(row_count): try: + # 使用索引访问行,而不是迭代器 + row = table.rows[row_idx] + if not hasattr(row, 'cells'): continue try: - cells = row.cells + # 先尝试获取cells的数量 + try: + cell_count = len(row.cells) + except (IndexError, AttributeError): + cell_count = 0 + + if cell_count == 0: + continue + + # 使用索引方式访问cells,而不是迭代器 + cells = [] + for cell_idx in range(cell_count): + try: + cell = row.cells[cell_idx] + cells.append(cell) + except (IndexError, AttributeError): + continue + if not cells: continue except (IndexError, AttributeError): @@ -707,7 +886,19 @@ class DocumentService: # 安全地获取paragraphs列表 try: - paragraphs = list(cell.paragraphs) if cell.paragraphs else [] + # 先尝试获取paragraphs的数量 + try: + para_count = len(cell.paragraphs) + except (IndexError, AttributeError): + para_count = 0 + + paragraphs = [] + for para_idx in range(para_count): + try: + para = cell.paragraphs[para_idx] + paragraphs.append(para) + except (IndexError, AttributeError): + continue except (IndexError, AttributeError): continue