优化文档服务中的表格处理逻辑：改用索引方式访问行、单元格和段落，以避免迭代时触发的索引错误；同时增强对异常情况的处理，确保在访问行、单元格和段落时的稳定性。

2025-12-11 16:34:50 +08:00 · 2025-12-11 16:34:50 +08:00 · 4d9080855c
commit 4d9080855c
parent 91fcd5461d
1 changed file with 212 additions and 21 deletions
--- a/services/document_service.py
+++ b/services/document_service.py
@@ -165,15 +165,42 @@ class DocumentService:
                        print(f"[WARN] 扫描表格 {table_idx} 时无法获取行数，跳过该表格: {str(e)}")
                        continue
                    
-                    for row_idx, row in enumerate(table.rows):
+                    # 使用索引方式访问行，而不是迭代器，避免在迭代时触发内部索引访问错误
+                    try:
+                        row_count = len(table.rows)
+                    except Exception:
+                        row_count = 0
+                    
+                    for row_idx in range(row_count):
                        try:
+                            # 使用索引访问行，而不是迭代器
+                            row = table.rows[row_idx]
+                            
                            # 安全地访问 row.cells，避免 docx 库在处理异常表格结构时的 bug
                            if not hasattr(row, 'cells'):
                                continue
                            
                            # 使用 try-except 包裹，防止 IndexError
                            try:
-                                cells = row.cells
+                                # 先尝试获取cells的数量
+                                try:
+                                    cell_count = len(row.cells)
+                                except (IndexError, AttributeError):
+                                    cell_count = 0
+                                
+                                if cell_count == 0:
+                                    continue
+                                
+                                # 使用索引方式访问cells，而不是迭代器
+                                cells = []
+                                for cell_idx in range(cell_count):
+                                    try:
+                                        cell = row.cells[cell_idx]
+                                        cells.append(cell)
+                                    except (IndexError, AttributeError):
+                                        # 如果某个单元格无法访问，跳过
+                                        continue
+                                
                                if not cells:
                                    continue
                            except (IndexError, AttributeError) as e:
@@ -187,7 +214,19 @@ class DocumentService:
                                    
                                    # 安全地获取paragraphs列表
                                    try:
-                                        paragraphs = list(cell.paragraphs) if cell.paragraphs else []
+                                        # 先尝试获取paragraphs的数量
+                                        try:
+                                            para_count = len(cell.paragraphs)
+                                        except (IndexError, AttributeError):
+                                            para_count = 0
+                                        
+                                        paragraphs = []
+                                        for para_idx in range(para_count):
+                                            try:
+                                                para = cell.paragraphs[para_idx]
+                                                paragraphs.append(para)
+                                            except (IndexError, AttributeError):
+                                                continue
                                    except (IndexError, AttributeError) as e:
                                        print(f"[WARN] 扫描表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落，跳过: {str(e)}")
                                        continue
@@ -447,16 +486,37 @@ class DocumentService:
                            print(f"[WARN] 无法获取表格 {table_idx} 的行数，跳过该表格: {str(e)}")
                            continue
                        
-                        for row_idx, row in enumerate(table.rows):
+                        # 使用索引方式访问行，而不是迭代器，避免在迭代时触发内部索引访问错误
+                        for row_idx in range(row_count):
                            try:
+                                # 使用索引访问行，而不是迭代器
+                                row = table.rows[row_idx]
+                                
                                # 安全地访问 row.cells，避免 docx 库在处理异常表格结构时的 bug
                                if not hasattr(row, 'cells'):
                                    continue
                                
                                # 使用 try-except 包裹，防止 IndexError
                                try:
-                                    # 尝试获取cells，如果失败则跳过该行
-                                    cells = row.cells
+                                    # 先尝试获取cells的数量
+                                    try:
+                                        cell_count = len(row.cells)
+                                    except (IndexError, AttributeError):
+                                        cell_count = 0
+                                    
+                                    if cell_count == 0:
+                                        continue
+                                    
+                                    # 使用索引方式访问cells，而不是迭代器
+                                    cells = []
+                                    for cell_idx in range(cell_count):
+                                        try:
+                                            cell = row.cells[cell_idx]
+                                            cells.append(cell)
+                                        except (IndexError, AttributeError):
+                                            # 如果某个单元格无法访问，跳过
+                                            continue
+                                    
                                    if not cells:
                                        continue
                                except (IndexError, AttributeError) as e:
@@ -472,7 +532,19 @@ class DocumentService:
                                        
                                        # 安全地获取paragraphs列表
                                        try:
-                                            paragraphs = list(cell.paragraphs) if cell.paragraphs else []
+                                            # 先尝试获取paragraphs的数量
+                                            try:
+                                                para_count = len(cell.paragraphs)
+                                            except (IndexError, AttributeError):
+                                                para_count = 0
+                                            
+                                            paragraphs = []
+                                            for para_idx in range(para_count):
+                                                try:
+                                                    para = cell.paragraphs[para_idx]
+                                                    paragraphs.append(para)
+                                                except (IndexError, AttributeError):
+                                                    continue
                                        except (IndexError, AttributeError) as e:
                                            print(f"[WARN] 表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 无法访问段落，跳过: {str(e)}")
                                            continue
@@ -532,22 +604,43 @@ class DocumentService:
                    if not table.rows:
                        continue
                    
-                    # 安全地获取表格行数
+                    # 安全地获取表格行数，使用索引方式访问行，而不是迭代器
                    try:
                        row_count = len(table.rows)
                    except Exception as e:
-                        print(f"[WARN] 验证表格 {table_idx} 时无法获取行数，跳过该表格: {str(e)}")
+                        print(f"[WARN] 保存前验证表格 {table_idx} 时无法获取行数，跳过该表格: {str(e)}")
                        continue
                    
-                    for row_idx, row in enumerate(table.rows):
+                    for row_idx in range(row_count):
                        try:
+                            # 使用索引访问行，而不是迭代器
+                            row = table.rows[row_idx]
+                            
                            # 安全地访问 row.cells，避免 docx 库在处理异常表格结构时的 bug
                            if not hasattr(row, 'cells'):
                                continue
                            
                            # 使用 try-except 包裹，防止 IndexError
                            try:
-                                cells = row.cells
+                                # 先尝试获取cells的数量
+                                try:
+                                    cell_count = len(row.cells)
+                                except (IndexError, AttributeError):
+                                    cell_count = 0
+                                
+                                if cell_count == 0:
+                                    continue
+                                
+                                # 使用索引方式访问cells，而不是迭代器
+                                cells = []
+                                for cell_idx in range(cell_count):
+                                    try:
+                                        cell = row.cells[cell_idx]
+                                        cells.append(cell)
+                                    except (IndexError, AttributeError):
+                                        # 如果某个单元格无法访问，跳过
+                                        continue
+                                
                                if not cells:
                                    continue
                            except (IndexError, AttributeError) as e:
@@ -561,7 +654,19 @@ class DocumentService:
                                    
                                    # 安全地获取paragraphs列表
                                    try:
-                                        paragraphs = list(cell.paragraphs) if cell.paragraphs else []
+                                        # 先尝试获取paragraphs的数量
+                                        try:
+                                            para_count = len(cell.paragraphs)
+                                        except (IndexError, AttributeError):
+                                            para_count = 0
+                                        
+                                        paragraphs = []
+                                        for para_idx in range(para_count):
+                                            try:
+                                                para = cell.paragraphs[para_idx]
+                                                paragraphs.append(para)
+                                            except (IndexError, AttributeError):
+                                                continue
                                    except (IndexError, AttributeError) as e:
                                        print(f"[WARN] 验证表格 {table_idx} 行 {row_idx} 单元格 {cell_idx} 时无法访问段落，跳过: {str(e)}")
                                        continue
@@ -613,28 +718,81 @@ class DocumentService:
                    if field_code:
                        verification_placeholders.add(field_code)
            
-            for table in doc.tables:
+            for table_idx, table in enumerate(doc.tables):
                try:
                    if not table.rows:
                        continue
-                    for row in table.rows:
+                    
+                    # 安全地获取表格行数，使用索引方式访问行
+                    try:
+                        row_count = len(table.rows)
+                    except Exception:
+                        continue
+                    
+                    for row_idx in range(row_count):
                        try:
+                            # 使用索引访问行，而不是迭代器
+                            row = table.rows[row_idx]
+                            
                            if not hasattr(row, 'cells'):
                                continue
+                            
                            try:
-                                cells = row.cells
+                                # 先尝试获取cells的数量
+                                try:
+                                    cell_count = len(row.cells)
+                                except (IndexError, AttributeError):
+                                    cell_count = 0
+                                
+                                if cell_count == 0:
+                                    continue
+                                
+                                # 使用索引方式访问cells，而不是迭代器
+                                cells = []
+                                for cell_idx in range(cell_count):
+                                    try:
+                                        cell = row.cells[cell_idx]
+                                        cells.append(cell)
+                                    except (IndexError, AttributeError):
+                                        continue
+                                
+                                if not cells:
+                                    continue
                            except (IndexError, AttributeError):
                                continue
-                            for cell in cells:
+                            
+                            for cell_idx, cell in enumerate(cells):
                                try:
-                                    if hasattr(cell, 'paragraphs'):
-                                        for paragraph in cell.paragraphs:
+                                    if not hasattr(cell, 'paragraphs'):
+                                        continue
+                                    
+                                    # 安全地获取paragraphs列表
+                                    try:
+                                        try:
+                                            para_count = len(cell.paragraphs)
+                                        except (IndexError, AttributeError):
+                                            para_count = 0
+                                        
+                                        paragraphs = []
+                                        for para_idx in range(para_count):
+                                            try:
+                                                para = cell.paragraphs[para_idx]
+                                                paragraphs.append(para)
+                                            except (IndexError, AttributeError):
+                                                continue
+                                    except (IndexError, AttributeError):
+                                        continue
+                                    
+                                    for paragraph in paragraphs:
+                                        try:
                                            text = paragraph.text
                                            matches = placeholder_pattern.findall(text)
                                            for match in matches:
                                                field_code = match.strip()
                                                if field_code:
                                                    verification_placeholders.add(field_code)
+                                        except Exception:
+                                            continue
                                except Exception:
                                    continue
                        except Exception:
@@ -688,13 +846,34 @@ class DocumentService:
                            except Exception:
                                continue
                            
-                            for row_idx, row in enumerate(table.rows):
+                            # 使用索引方式访问行，而不是迭代器
+                            for row_idx in range(row_count):
                                try:
+                                    # 使用索引访问行，而不是迭代器
+                                    row = table.rows[row_idx]
+                                    
                                    if not hasattr(row, 'cells'):
                                        continue
                                    
                                    try:
-                                        cells = row.cells
+                                        # 先尝试获取cells的数量
+                                        try:
+                                            cell_count = len(row.cells)
+                                        except (IndexError, AttributeError):
+                                            cell_count = 0
+                                        
+                                        if cell_count == 0:
+                                            continue
+                                        
+                                        # 使用索引方式访问cells，而不是迭代器
+                                        cells = []
+                                        for cell_idx in range(cell_count):
+                                            try:
+                                                cell = row.cells[cell_idx]
+                                                cells.append(cell)
+                                            except (IndexError, AttributeError):
+                                                continue
+                                        
                                        if not cells:
                                            continue
                                    except (IndexError, AttributeError):
@@ -707,7 +886,19 @@ class DocumentService:
                                            
                                            # 安全地获取paragraphs列表
                                            try:
-                                                paragraphs = list(cell.paragraphs) if cell.paragraphs else []
+                                                # 先尝试获取paragraphs的数量
+                                                try:
+                                                    para_count = len(cell.paragraphs)
+                                                except (IndexError, AttributeError):
+                                                    para_count = 0
+                                                
+                                                paragraphs = []
+                                                for para_idx in range(para_count):
+                                                    try:
+                                                        para = cell.paragraphs[para_idx]
+                                                        paragraphs.append(para)
+                                                    except (IndexError, AttributeError):
+                                                        continue
                                            except (IndexError, AttributeError):
                                                continue