优化文档服务中的表格处理逻辑,改用索引方式访问行、单元格和段落,避免迭代时触发索引错误,同时增强异常处理,确保访问行、单元格和段落时的稳定性

This commit is contained in:
python 2025-12-11 16:34:50 +08:00
parent 91fcd5461d
commit 4d9080855c

View File

@ -165,15 +165,42 @@ class DocumentService:
print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}")
continue
for row_idx, row in enumerate(table.rows):
# 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误
try:
row_count = len(table.rows)
except Exception:
row_count = 0
for row_idx in range(row_count):
try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'):
continue
# 使用 try-except 包裹,防止 IndexError
try:
cells = row.cells
# 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells:
continue
except (IndexError, AttributeError) as e:
@ -187,7 +214,19 @@ class DocumentService:
# 安全地获取paragraphs列表
try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else []
# 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e:
print(f"[WARN] 扫描表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}")
continue
@ -447,16 +486,37 @@ class DocumentService:
print(f"[WARN] 无法获取表格 {table_idx} 的行数,跳过该表格: {str(e)}")
continue
for row_idx, row in enumerate(table.rows):
# 使用索引方式访问行,而不是迭代器,避免在迭代时触发内部索引访问错误
for row_idx in range(row_count):
try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'):
continue
# 使用 try-except 包裹,防止 IndexError
try:
# 尝试获取cells如果失败则跳过该行
cells = row.cells
# 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells:
continue
except (IndexError, AttributeError) as e:
@ -472,7 +532,19 @@ class DocumentService:
# 安全地获取paragraphs列表
try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else []
# 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e:
print(f"[WARN] 表格 {table_idx}{row_idx} 单元格 {cell_idx} 无法访问段落,跳过: {str(e)}")
continue
@ -532,22 +604,43 @@ class DocumentService:
if not table.rows:
continue
# 安全地获取表格行数
# 安全地获取表格行数,使用索引方式访问行,而不是迭代器
try:
row_count = len(table.rows)
except Exception as e:
print(f"[WARN] 验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}")
print(f"[WARN] 保存前验证表格 {table_idx} 时无法获取行数,跳过该表格: {str(e)}")
continue
for row_idx, row in enumerate(table.rows):
for row_idx in range(row_count):
try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
# 安全地访问 row.cells避免 docx 库在处理异常表格结构时的 bug
if not hasattr(row, 'cells'):
continue
# 使用 try-except 包裹,防止 IndexError
try:
cells = row.cells
# 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
# 如果某个单元格无法访问,跳过
continue
if not cells:
continue
except (IndexError, AttributeError) as e:
@ -561,7 +654,19 @@ class DocumentService:
# 安全地获取paragraphs列表
try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else []
# 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError) as e:
print(f"[WARN] 验证表格 {table_idx}{row_idx} 单元格 {cell_idx} 时无法访问段落,跳过: {str(e)}")
continue
@ -613,28 +718,81 @@ class DocumentService:
if field_code:
verification_placeholders.add(field_code)
for table in doc.tables:
for table_idx, table in enumerate(doc.tables):
try:
if not table.rows:
continue
for row in table.rows:
# 安全地获取表格行数,使用索引方式访问行
try:
row_count = len(table.rows)
except Exception:
continue
for row_idx in range(row_count):
try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
if not hasattr(row, 'cells'):
continue
try:
cells = row.cells
# 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
continue
if not cells:
continue
except (IndexError, AttributeError):
continue
for cell in cells:
for cell_idx, cell in enumerate(cells):
try:
if hasattr(cell, 'paragraphs'):
for paragraph in cell.paragraphs:
if not hasattr(cell, 'paragraphs'):
continue
# 安全地获取paragraphs列表
try:
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError):
continue
for paragraph in paragraphs:
try:
text = paragraph.text
matches = placeholder_pattern.findall(text)
for match in matches:
field_code = match.strip()
if field_code:
verification_placeholders.add(field_code)
except Exception:
continue
except Exception:
continue
except Exception:
@ -688,13 +846,34 @@ class DocumentService:
except Exception:
continue
for row_idx, row in enumerate(table.rows):
# 使用索引方式访问行,而不是迭代器
for row_idx in range(row_count):
try:
# 使用索引访问行,而不是迭代器
row = table.rows[row_idx]
if not hasattr(row, 'cells'):
continue
try:
cells = row.cells
# 先尝试获取cells的数量
try:
cell_count = len(row.cells)
except (IndexError, AttributeError):
cell_count = 0
if cell_count == 0:
continue
# 使用索引方式访问cells而不是迭代器
cells = []
for cell_idx in range(cell_count):
try:
cell = row.cells[cell_idx]
cells.append(cell)
except (IndexError, AttributeError):
continue
if not cells:
continue
except (IndexError, AttributeError):
@ -707,7 +886,19 @@ class DocumentService:
# 安全地获取paragraphs列表
try:
paragraphs = list(cell.paragraphs) if cell.paragraphs else []
# 先尝试获取paragraphs的数量
try:
para_count = len(cell.paragraphs)
except (IndexError, AttributeError):
para_count = 0
paragraphs = []
for para_idx in range(para_count):
try:
para = cell.paragraphs[para_idx]
paragraphs.append(para)
except (IndexError, AttributeError):
continue
except (IndexError, AttributeError):
continue