优化文档服务中的表格处理逻辑:使用索引方式访问行和单元格,以避免迭代时的索引错误;同时增强对异常情况的处理,确保访问行、单元格和段落时的稳定性。

This commit is contained in:
python 2025-12-11 16:34:50 +08:00
parent 91fcd5461d
commit 4d9080855c

View File

@ -165,15 +165,42 @@ class DocumentService:
print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}")
continue continue
for row_idx, row in enumerate(table.rows): # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误
try:
row_count = len(table.rows)
except Exception:
row_count = 0
for row_idx in range(row_count):
try: try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug # 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'): if not hasattr(row, 'cells'):
continue continue
# 使用 try-except 包裹,防止 IndexError # 使用 try-except 包裹,防止 IndexError
try: try:
cells = row.cells # 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells: if not cells:
continue continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
@ -187,7 +214,19 @@ class DocumentService:
# 安全地获取paragraphs列表 # 安全地获取paragraphs列表
try: try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else [] # 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
print(f"[WARN] 扫描表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") print(f"[WARN] 扫描表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}")
continue continue
@ -447,16 +486,37 @@ class DocumentService:
print(f"[WARN] 无法获取表格 {table_idx} 的行数,跳过该表格: {str(e)}") print(f"[WARN] 无法获取表格 {table_idx} 的行数,跳过该表格: {str(e)}")
continue continue
for row_idx, row in enumerate(table.rows): # 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误
for row_idx in range(row_count):
try: try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug # 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'): if not hasattr(row, 'cells'):
continue continue
# 使用 try-except 包裹,防止 IndexError # 使用 try-except 包裹,防止 IndexError
try: try:
# 尝试获取cells如果失败则跳过该行 # 先尝试获取cells的数量
cells = row.cells try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells: if not cells:
continue continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
@ -472,7 +532,19 @@ class DocumentService:
# 安全地获取paragraphs列表 # 安全地获取paragraphs列表
try: try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else [] # 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
print(f"[WARN] 表格 {table_idx}{row_idx} 单元格 {cell_idx} 无法访问段落,跳过: {str(e)}") print(f"[WARN] 表格 {table_idx}{row_idx} 单元格 {cell_idx} 无法访问段落,跳过: {str(e)}")
continue continue
@ -532,22 +604,43 @@ class DocumentService:
if not table.rows: if not table.rows:
continue continue
# 安全地获取表格行数 # 安全地获取表格行数,使用索引方式访问行,而不是迭代器
try: try:
row_count = len(table.rows) row_count = len(table.rows)
except Exception as e: except Exception as e:
print(f"[WARN] 验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}") print(f"[WARN] 保存前验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}")
continue continue
for row_idx, row in enumerate(table.rows): for row_idx in range(row_count):
try: try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug # 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'): if not hasattr(row, 'cells'):
continue continue
# 使用 try-except 包裹,防止 IndexError # 使用 try-except 包裹,防止 IndexError
try: try:
cells = row.cells # 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells: if not cells:
continue continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
@ -561,7 +654,19 @@ class DocumentService:
# 安全地获取paragraphs列表 # 安全地获取paragraphs列表
try: try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else [] # 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e: except (IndexError, AttributeError) as e:
print(f"[WARN] 验证表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}") print(f"[WARN] 验证表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}")
continue continue
@ -613,28 +718,81 @@ class DocumentService:
if field_code: if field_code:
verification_placeholders.add(field_code) verification_placeholders.add(field_code)
for table in doc.tables: for table_idx, table in enumerate(doc.tables):
try: try:
if not table.rows: if not table.rows:
continue continue
for row in table.rows:
# 安全地获取表格行数,使用索引方式访问行
try:
row_count = len(table.rows)
except Exception:
continue
for row_idx in range(row_count):
try: try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
if not hasattr(row, 'cells'): if not hasattr(row, 'cells'):
continue continue
try: try:
cells = row.cells # 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
continue
if not cells:
continue
except (IndexError, AttributeError): except (IndexError, AttributeError):
continue continue
for cell in cells:
for cell_idx, cell in enumerate(cells):
try: try:
if hasattr(cell, 'paragraphs'): if not hasattr(cell, 'paragraphs'):
for paragraph in cell.paragraphs: continue
# 安全地获取paragraphs列表
try:
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError):
continue
for paragraph in paragraphs:
try:
text = paragraph.text text = paragraph.text
matches = placeholder_pattern.findall(text) matches = placeholder_pattern.findall(text)
for match in matches: for match in matches:
field_code = match.strip() field_code = match.strip()
if field_code: if field_code:
verification_placeholders.add(field_code) verification_placeholders.add(field_code)
except Exception:
continue
except Exception: except Exception:
continue continue
except Exception: except Exception:
@ -688,13 +846,34 @@ class DocumentService:
except Exception: except Exception:
continue continue
for row_idx, row in enumerate(table.rows): # 使用索引方式访问行,而不是迭代器
for row_idx in range(row_count):
try: try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
if not hasattr(row, 'cells'): if not hasattr(row, 'cells'):
continue continue
try: try:
cells = row.cells # 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
continue
if not cells: if not cells:
continue continue
except (IndexError, AttributeError): except (IndexError, AttributeError):
@ -707,7 +886,19 @@ class DocumentService:
# 安全地获取paragraphs列表 # 安全地获取paragraphs列表
try: try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else [] # 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError): except (IndexError, AttributeError):
continue continue