# merge_results.py

import os
import pandas as pd
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter

def collect_combined_scores(evaluated_dir):
    """
    Recursively find every ``combined_scores.xlsx`` file under *evaluated_dir*.

    Args:
        evaluated_dir: Root directory to walk.

    Returns:
        A sorted list of paths, each built with ``os.path.join`` from the
        walked directory and the file name. The list is empty when nothing
        matches or when the directory does not exist.
    """
    # os.walk visits directories in a filesystem-dependent order. Sorting
    # makes the result reproducible, which matters because the first entry
    # is used as the source of the category list and the entries are merged
    # in list order.
    return sorted(
        os.path.join(root, 'combined_scores.xlsx')
        for root, _dirs, files in os.walk(evaluated_dir)
        if 'combined_scores.xlsx' in files
    )

def get_categories_from_file(file_path):
    """
    Return the row labels of an Excel file as a list.

    The first column of the sheet is used as the index, and the index values
    are what gets returned. If reading fails for any reason, the error is
    printed and an empty list is returned.
    """
    categories = []
    try:
        # The first column holds the category labels.
        categories = pd.read_excel(file_path, index_col=0).index.tolist()
    except Exception as e:
        print(f"Error reading categories from {file_path}: {e}")
    return categories

def extract_model_name(file_path):
    """
    Return the name of the directory that directly contains *file_path*.

    Example: /path/to/model1/combined_scores.xlsx -> model1
    """
    parent_dir, _file_name = os.path.split(file_path)
    _ancestors, model_name = os.path.split(parent_dir)
    return model_name

def drop_empty_string_columns(sheet_df):
    """
    Remove, in place, every column whose values all have zero string length.

    NaN cells are ignored when measuring, so a column with no non-NaN values
    is removed as well. The same (mutated) DataFrame is returned.
    """
    for name in tuple(sheet_df.columns):
        # Measure only the cells that actually hold a value.
        present_values = sheet_df[name].dropna()
        total_chars = present_values.astype(str).apply(len).sum()

        if total_chars != 0:
            continue

        # Nothing but empty strings (or nothing at all) in this column.
        sheet_df.drop(columns=[name], inplace=True)

    return sheet_df

def create_score_sheets(combined_scores_files, categories, score_types):
    """
    Build one DataFrame per score type, with one row per model.

    Each workbook is read with its first column as the index (the category
    labels). For every score type that exists as a column in the workbook,
    that column becomes the row for the model; the model name is obtained
    from the file path via ``extract_model_name``.

    Args:
        combined_scores_files: Paths of the workbooks to merge.
        categories: Category labels used as the columns of every sheet.
        score_types: Column names to extract; one output sheet per entry.

    Returns:
        A dict mapping each score type to a DataFrame indexed by model name,
        holding the non-empty category columns plus a 'Score Average' column
        with the per-model mean. Files that cannot be read and score types
        missing from a file are reported on stdout and skipped.
    """
    # Start with an empty frame per score type; rows are added per model.
    score_sheets = {score_type: pd.DataFrame(columns=categories) for score_type in score_types}

    for file in combined_scores_files:
        model_name = extract_model_name(file)
        try:
            df = pd.read_excel(file, index_col=0)  # first column = category labels
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue

        for score_type in score_types:
            if score_type not in df.columns:
                print(f"Warning: '{score_type}' not found in {file}")
                continue

            # reindex() fills categories that this file lacks with NaN.
            # Selecting with .loc[categories, ...] would raise KeyError as
            # soon as a single category label is missing from the file.
            # If two files yield the same model name, the later one
            # overwrites the earlier row.
            score_sheets[score_type].loc[model_name] = df[score_type].reindex(categories)

    for sheet_df in score_sheets.values():
        # 1. Drop columns that carry no data (mutates sheet_df in place).
        drop_empty_string_columns(sheet_df)

        # 2. Discard any 'Score Average' carried over from the input files so
        #    it is not included in the freshly computed mean.
        if 'Score Average' in sheet_df.columns:
            sheet_df.drop(columns=['Score Average'], inplace=True)

        # 3. Per-model mean over the remaining categories, ignoring NaN.
        sheet_df['Score Average'] = sheet_df.mean(axis=1, skipna=True)

    return score_sheets

def _longest_text(values):
    """Return the length of the longest ``str(value)`` in *values* (0 if empty)."""
    return max((len(str(value)) for value in values), default=0)


def save_to_excel(score_sheets, output_file):
    """
    Write every DataFrame in *score_sheets* to its own sheet of one workbook.

    After each sheet is written, the header row and the index column are set
    to bold, centered text, and every column is widened to fit its longest
    value or header plus two characters of padding.

    Args:
        score_sheets: Mapping of sheet name to the DataFrame to write.
        output_file: Path of the Excel file to create.
    """
    # Style objects are the same for every cell, so build them once.
    header_font = Font(bold=True)
    centered = Alignment(horizontal='center', vertical='center')

    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for sheet_name, df in score_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name)
            worksheet = writer.sheets[sheet_name]

            # Worksheet column 1 holds the index, so data columns start at 2.
            for col_num, col in enumerate(df.columns, 2):
                header_cell = worksheet.cell(row=1, column=col_num)
                header_cell.font = header_font
                header_cell.alignment = centered

                # Positional access keeps this working for duplicate labels;
                # str(col) keeps it working for non-string labels. An empty
                # column yields 0 instead of NaN, so the width stays valid.
                values = df.iloc[:, col_num - 2]
                width = max(_longest_text(values), len(str(col))) + 2
                worksheet.column_dimensions[get_column_letter(col_num)].width = width

            # Index cells: row 1 is the header, so data rows start at 2.
            for row_num in range(2, len(df) + 2):
                index_cell = worksheet.cell(row=row_num, column=1)
                index_cell.font = header_font
                index_cell.alignment = centered

            # Width of the index column.
            index_name = df.index.name
            index_name_length = len(str(index_name)) if index_name is not None else 0
            index_width = max(_longest_text(df.index), index_name_length) + 2
            worksheet.column_dimensions[get_column_letter(1)].width = index_width

    print(f"모든 스코어가 '{output_file}' 파일에 저장되었습니다.")

def main(evaluated_dir='evaluated', output_file='merged_scores.xlsx'):
    """
    Merge every ``combined_scores.xlsx`` under *evaluated_dir* into one workbook.

    Args:
        evaluated_dir: Directory searched recursively for input workbooks.
            Defaults to 'evaluated'.
        output_file: Path of the merged Excel file to write.
            Defaults to 'merged_scores.xlsx'.

    Returns:
        None. Progress and problems are reported on stdout; the function
        returns early when no input file is found or when no categories can
        be read from the first file.
    """
    # 1. Find all combined_scores.xlsx files under the input directory.
    combined_scores_files = collect_combined_scores(evaluated_dir)
    if not combined_scores_files:
        print(f"No 'combined_scores.xlsx' files found in the '{evaluated_dir}' directory.")
        return

    print(f"Found {len(combined_scores_files)} 'combined_scores.xlsx' files.")

    # 2. The first file defines the category list used for every sheet.
    categories = get_categories_from_file(combined_scores_files[0])
    if not categories:
        print("Failed to extract categories from the first file.")
        return

    # 3. Score columns to extract; each becomes one sheet in the output.
    score_types = [
        'cot_1_shot_single_score',
        'cot_1_shot_multi_score',
        '1_shot_single_score',
        '1_shot_multi_score',
        'default_single_score',
        'default_multi_score',
        'lotte_single_turn',
    ]

    # 4. Collect the data and build one frame per score type.
    score_sheets = create_score_sheets(combined_scores_files, categories, score_types)

    # 5. Write the merged workbook.
    save_to_excel(score_sheets, output_file)

# Run the merge only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()