ai-business-write/parse_excel_fields.py

"""
解析Excel文件，提取所有字段定义
"""
import openpyxl
from pathlib import Path
import json
from collections import defaultdict

# Excel文件路径
EXCEL_FILE = '技术文档/智慧监督项目模板数据结构设计表-20251125-一凡标注.xlsx'

def parse_excel():
    """解析Excel文件，提取字段信息"""
    print("="*60)
    print("开始解析Excel文件...")
    print("="*60)

    try:
        # 检查文件是否存在
        if not Path(EXCEL_FILE).exists():
            print(f"错误: Excel文件不存在: {EXCEL_FILE}")
            return None

        # 打开Excel文件
        workbook = openpyxl.load_workbook(EXCEL_FILE, data_only=True)
        print(f"✓ 成功打开Excel文件: {EXCEL_FILE}")

        # 获取工作表
        sheet = workbook.active

        # 读取所有数据
        all_rows = []
        for row in sheet.iter_rows(values_only=True):
            all_rows.append(row)

        print(f"总行数: {len(all_rows)}")

        if len(all_rows) < 2:
            print("错误: Excel文件数据不足")
            return None

        # 解析表头
        headers = all_rows[0]
        print(f"\n表头: {headers}")

        # 字段映射关系
        field_map = {
            '一级分类': 0,
            '二级分类': 1,
            '三级分类': 2,
            '输入数据字段': 3,
            '输出数据字段': 4,
            '输出示例数据': 5,
            '备注说明': 6
        }

        # 存储字段信息
        templates = defaultdict(lambda: {
            'template_name': '',
            'template_code': '',
            'input_fields': [],
            'output_fields': []
        })

        current_template = None
        current_input_field = None

        # 处理数据行（从第2行开始）
        for i, row in enumerate(all_rows[1:], 2):
            # 跳过空行
            if not any(row):
                continue

            level1 = row[0]  # 一级分类
            level2 = row[1]  # 二级分类
            level3 = row[2]  # 三级分类
            input_field = row[3]  # 输入数据字段
            output_field = row[4]  # 输出数据字段
            example = row[5]  # 输出示例数据
            remark = row[6]  # 备注说明

            # 如果有一级分类，说明是新的模板组
            if level1:
                print(f"\n处理一级分类: {level1}")

            # 如果有二级分类，说明是新的模板
            if level2:
                current_template = level2
                # 生成模板编码（将中文转换为大写英文，去掉空格）
                template_code = level2.upper().replace(' ', '_').replace('初步核实审批表', 'PRELIMINARY_VERIFICATION_APPROVAL').replace('请示报告卡', 'REPORT_CARD')
                templates[current_template]['template_name'] = current_template
                templates[current_template]['template_code'] = template_code
                print(f"  处理二级分类（模板）: {current_template} -> {template_code}")

            # 处理输入字段
            if input_field and input_field != current_input_field:
                current_input_field = input_field
                if current_template:
                    templates[current_template]['input_fields'].append({
                        'name': input_field,
                        'field_code': input_field.replace('线索信息', 'clue_info').replace('被核查人员工作基本情况线索', 'target_basic_info_clue')
                    })
                    print(f"    输入字段: {input_field}")

            # 处理输出字段
            if output_field:
                if current_template:
                    # 生成字段编码（简化版，实际需要更精确的映射）
                    field_code = output_field.lower().replace(' ', '_').replace('被核查人姓名', 'target_name').replace('被核查人员单位及职务', 'target_organization_and_position')

                    templates[current_template]['output_fields'].append({
                        'name': output_field,
                        'field_code': field_code,
                        'example': example,
                        'remark': remark
                    })
                    print(f"    输出字段: {output_field} -> {field_code}")

        workbook.close()

        # 转换为字典格式
        result = {}
        for template_name, template_info in templates.items():
            result[template_name] = template_info

        return result

    except Exception as e:
        print(f"解析Excel文件时发生错误: {e}")
        import traceback
        traceback.print_exc()
        return None

def generate_field_mapping():
    """生成字段映射关系（基于已知字段）"""
    # 根据初步核实审批表的字段定义，创建字段名称到字段编码的映射
    field_mapping = {
        '被核查人姓名': 'target_name',
        '被核查人员单位及职务': 'target_organization_and_position',
        '被核查人员性别': 'target_gender',
        '被核查人员出生年月': 'target_date_of_birth',
        '被核查人员政治面貌': 'target_political_status',
        '被核查人员职级': 'target_professional_rank',
        '线索来源': 'clue_source',
        '主要问题线索': 'target_issue_description',
        '初步核实审批表承办部门意见': 'department_opinion',
        '初步核实审批表填表人': 'filler_name',
        '线索信息': 'clue_info',
        '被核查人员工作基本情况线索': 'target_basic_info_clue',
    }
    return field_mapping

if __name__ == '__main__':
    result = parse_excel()
    if result:
        print("\n" + "="*60)
        print("解析结果汇总:")
        print("="*60)
        for template_name, template_info in result.items():
            print(f"\n模板: {template_name}")
            print(f"  模板编码: {template_info['template_code']}")
            print(f"  输入字段数: {len(template_info['input_fields'])}")
            print(f"  输出字段数: {len(template_info['output_fields'])}")

        # 保存为JSON文件
        output_file = 'parsed_fields.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"\n✓ 结果已保存到: {output_file}")