from docx import Document
import docx
import pandas as pd
import os
from docx.oxml.ns import qn
from PIL import Image, UnidentifiedImageError
import io
from typing import Optional
from modules.ocr import ReaderForEasyOCR
import asyncio
import logging
# Module-scoped logger: records carry this module's name and still propagate
# to whatever handlers the application installed on the root logger.
logger = logging.getLogger(__name__)

class WordParser:
    """Flatten a .docx document into text.

    Paragraph runs are emitted as plain text, tables as CSV, and embedded
    pictures as ``(image)...(/image)`` markers that optionally wrap OCR output
    in ``(ocr)...(/ocr)``.
    """

    # Pictures narrower or shorter than this many pixels are never sent to OCR.
    MIN_OCR_IMAGE_SIZE = 150

    def __init__(self, use_ocr: bool, ocr_reader: Optional["ReaderForEasyOCR"] = None):
        """Store the OCR configuration.

        Args:
            use_ocr: Whether pictures should be run through ``ocr_reader``.
            ocr_reader: Awaitable callable that takes a PIL image and yields
                ``(bbox, text)`` pairs. OCR is skipped when it is falsy.
        """
        self.use_ocr = use_ocr
        self.ocr_reader = ocr_reader

    async def parse(self, contents: bytes, filename: str) -> list:
        """Parse raw .docx bytes into a list of ``{"document_id", "text"}`` dicts.

        The whole document is emitted as a single entry whose id is
        ``"<basename>@0001"``. Body-level paragraphs and tables are rendered in
        document order, each followed by a newline. An empty list is returned
        when the body contains neither.
        """
        doc = Document(io.BytesIO(contents))
        document_name = os.path.basename(filename)
        page_number = 1

        chunks = []
        for element in doc.element.body:
            # Exact qualified-name comparison; unlike ``str.endswith`` it is
            # also safe for nodes whose ``tag`` is not a string.
            if element.tag == qn('w:p'):
                para = docx.text.paragraph.Paragraph(element, doc)
                chunks.append(await self.extract_paragraph_content(para))
            elif element.tag == qn('w:tbl'):
                table = docx.table.Table(element, doc)
                df = await self.unmerge_table(table)
                chunks.append(await self.dataframe_to_csv(df))

        if not chunks:
            return []

        text = "".join(chunk + "\n" for chunk in chunks)
        return [
            {
                "document_id": f"{document_name}@{page_number:04}",
                "text": text,
            }
        ]

    def extract_tables(self, file_path: str) -> list:
        """Return every top-level table of the document at *file_path* as a DataFrame.

        This is a synchronous entry point: it drives the async
        ``unmerge_table`` with ``asyncio.run`` and therefore raises
        ``RuntimeError`` if called from a thread that already has a running
        event loop.
        """
        doc = Document(file_path)

        async def _unmerge_all():
            return [await self.unmerge_table(table) for table in doc.tables]

        return asyncio.run(_unmerge_all())

    @staticmethod
    def _find_embed_id(element) -> Optional[str]:
        """Return the relationship id of the first embedded picture below *element*, if any."""
        for blip in element.iter(qn('a:blip')):
            embed_id = blip.get(qn('r:embed'))
            if embed_id is not None:
                return embed_id
        return None

    async def _render_image(self, part, embed_id: str) -> Optional[str]:
        """Build the ``(image)`` marker for the picture stored under *embed_id* in *part*.

        Returns ``None`` when the picture is skipped: the relationship is
        missing, Pillow cannot identify the format, or the format is WMF.
        """
        try:
            image_stream = part.related_parts[embed_id].blob
        except KeyError:
            logger.warning("Image relationship %s not found. Skipping this image.", embed_id)
            return None

        try:
            image = Image.open(io.BytesIO(image_stream))
        except UnidentifiedImageError:
            logger.error("Unable to identify image format. Skipping this image.")
            return None

        # Skip WMF pictures.
        if image.format == "WMF":
            return None

        ocr_text = ""
        width, height = image.size
        min_size = self.MIN_OCR_IMAGE_SIZE

        # OCR only applies to pictures of at least min_size x min_size pixels.
        if self.use_ocr and self.ocr_reader and width >= min_size and height >= min_size:
            # Convert to grayscale before handing the picture to the reader.
            image = image.convert('L')
            ocr_results = await self.ocr_reader(image)
            ocr_text = "\n".join(text for _bbox, text in ocr_results)
            ocr_text = f"(ocr)\n{ocr_text}\n(/ocr)"

        return f"(image)\n{ocr_text}\n(/image)"

    async def extract_paragraph_content(self, para):
        """Return the text of *para*, replacing picture runs with image markers.

        A run counts as a picture only when it actually contains an embedded
        ``a:blip``; every other run contributes ``run.text``. Only the first
        picture of a run is rendered.
        """
        pieces = []
        for run in para.runs:
            embed_id = self._find_embed_id(run._element)
            if embed_id is None:
                pieces.append(run.text)
                continue
            marker = await self._render_image(run.part, embed_id)
            if marker is not None:
                pieces.append(marker)
        return "".join(pieces)

    async def extract_cell_content(self, cell):
        """Return the stripped text of a table cell.

        Paragraphs, nested tables (rendered as CSV) and any other child element
        holding an embedded picture are rendered in order, one per line.
        """
        pieces = []
        for element in cell._element:
            if element.tag == qn('w:p'):
                para = docx.text.paragraph.Paragraph(element, cell)
                pieces.append(await self.extract_paragraph_content(para) + '\n')
            elif element.tag == qn('w:tbl'):
                nested_table = docx.table.Table(element, cell._parent)
                nested_df = await self.unmerge_table(nested_table)
                pieces.append(await self.dataframe_to_csv(nested_df) + '\n')
            else:
                embed_id = self._find_embed_id(element)
                if embed_id is None:
                    continue
                # Raw XML children have no ``part``; resolve the relationship
                # through the cell that owns them.
                marker = await self._render_image(cell.part, embed_id)
                if marker is not None:
                    pieces.append(marker + '\n')
        return "".join(pieces).strip()

    async def unmerge_table(self, table):
        """Convert *table* into a DataFrame whose first row becomes the header.

        ``row.cells`` already yields one entry per grid column, repeating a
        merged cell for every column/row it covers, so each entry is simply
        written to its own position. Rows shorter than the widest row are
        padded with empty strings. A table without rows yields an empty
        DataFrame.
        """
        # ``row.cells`` is computed once per row and reused below.
        cells_by_row = [list(row.cells) for row in table.rows]
        if not cells_by_row:
            return pd.DataFrame()

        cols = max(len(cells) for cells in cells_by_row)
        data = [[''] * cols for _ in cells_by_row]

        for row_idx, cells in enumerate(cells_by_row):
            for col_idx, cell in enumerate(cells):
                data[row_idx][col_idx] = await self.extract_cell_content(cell)

        df = pd.DataFrame(data)
        df.columns = df.iloc[0]
        df = df.drop(0).reset_index(drop=True)
        return df

    async def dataframe_to_csv(self, df):
        """Serialize *df* to CSV text without the index column."""
        return df.to_csv(index=False)