Commit 9a983009 authored by kihoon.lee's avatar kihoon.lee
Browse files


parent d65bdf42
# FileParser-custom
1. 도커 이미지 빌드
``` bash
docker build -t fileparser-custom:latest .
```
2. 컨테이너 실행
- docker-compose 실행
``` bash
docker compose up -d
```
### 엔드포인트
1. `/upload`
파일을 업로드하면 본문 내용을 문자열(str)로 응답합니다.
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
from parsers import FileParserFactory
app = FastAPI()


@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Parse an uploaded file and return its extracted content as JSON.

    The parser is chosen from the uploaded file's name via
    ``FileParserFactory`` (OCR disabled).

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        A JSON response ``{"filename": ..., "content": ...}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when anything raises.
    """
    try:
        contents = await file.read()
        factory = FileParserFactory(use_ocr=False)
        parser = factory.get_parser(file.filename)
        content = await parser.parse(contents, file.filename)
        return JSONResponse(
            content={"filename": file.filename, "content": content}
        )
    except Exception as e:
        # Top-level request boundary: report any failure (unsupported format,
        # parse error, ...) to the client instead of a bare 500.
        return JSONResponse(content={"error": str(e)}, status_code=400)


if __name__ == "__main__":
    # Listen on all interfaces so the container's published port 8080 is reachable.
    uvicorn.run(app, host="0.0.0.0", port=8080)
version: '3'

services:
  # NOTE(review): service key assumed to match container_name - confirm.
  fileparser-custom:
    image: fileparser-custom:latest
    container_name: fileparser-custom
    restart: always
    ports:
      # host:container - quoted so YAML never reads a mapping as a number
      - "51033:80"
      - "51034:8080"
    environment:
      TZ: Asia/Seoul
# CUDA 12.1 development base image on Ubuntu 22.04.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# Install Python and build tooling, then drop the apt lists to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3.10 \
        python3-pip \
        openmpi-bin \
        libopenmpi-dev \
        python3-dev \
        build-essential && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code changes.
COPY ./requirements.txt /requirements.txt
# Absolute path: does not depend on the current working directory.
RUN pip3 install --no-cache-dir -r /requirements.txt

COPY ./ ./

# NOTE(review): the script name was empty here; main.py is assumed to be the
# FastAPI entry point - confirm against the repository layout.
CMD ["python3", "main.py"]
from .reader import ReaderForEasyOCR
# Public API of the OCR package.
__all__ = ["ReaderForEasyOCR"]
from typing import List, Tuple
# Bounding box as four integers; ReaderForEasyOCR fills it as (x1, y1, x2, y2).
BBOX = Tuple[int, int, int, int]
# Three nested list levels of (bounding box, text) pairs.
# NOTE(review): presumably table rows -> cells -> text fragments; no usage is
# visible here, so confirm against the code that consumes this alias.
CELL_TEXTS = List[List[List[Tuple[BBOX, str]]]]
from typing import List, Optional, Tuple, Union
import easyocr
import numpy as np
from PIL import Image
import asyncio
from concurrent.futures import ThreadPoolExecutor
from .io import BBOX
class ReaderForEasyOCR:
    """Awaitable wrapper around ``easyocr.Reader``.

    The blocking ``readtext`` call runs on a thread pool so that callers on
    an asyncio event loop are not stalled while OCR is in progress.
    """

    def __init__(self, languages: Optional[List[str]] = None, use_gpu: Optional[bool] = None, **kwargs):
        """Create the underlying EasyOCR reader.

        Args:
            languages: Language codes forwarded to ``easyocr.Reader``.
            use_gpu: Forwarded as the reader's ``gpu`` argument.
            **kwargs: Any further ``easyocr.Reader`` keyword arguments.
        """
        self.reader = easyocr.Reader(languages, gpu=use_gpu, **kwargs)
        self.executor = ThreadPoolExecutor()

    async def __call__(self, inputs: Union[np.ndarray, Image.Image]) -> List[Tuple[BBOX, str]]:
        """Run OCR on an image and return the recognised text boxes.

        Args:
            inputs: Image as a numpy array or a PIL image (converted to an
                array before recognition).

        Returns:
            ``((x1, y1, x2, y2), text)`` pairs ordered top-to-bottom, then
            left-to-right, with whitespace-only texts removed.
        """
        if isinstance(inputs, Image.Image):
            inputs = np.array(inputs)

        loop = asyncio.get_running_loop()
        raw_results = await loop.run_in_executor(self.executor, self.reader.readtext, inputs)

        # Each result unpacks as (four corner points, text, ignored value);
        # the first and third corner points become the integer box.
        boxes: List[Tuple[BBOX, str]] = [
            ((int(x1), int(y1), int(x2), int(y2)), text)
            for ((x1, y1), _, (x2, y2), _), text, _ in raw_results
        ]

        # Bucket coordinates into 30-pixel bands so boxes on roughly the same
        # line sort together before being ordered by their x position.
        boxes.sort(key=lambda item: (item[0][1] // 30, item[0][0] // 30))
        return [item for item in boxes if item[1].strip()]
from .pdf_parser import PDFParser
from .ppt_parser import PPTParser
from .word_parser import WordParser
from .excel_parser import ExcelParser
from .text_parser import TextParser
from modules.ocr import ReaderForEasyOCR
import os
class FileParserFactory:
    """Select the parser implementation that matches a file's extension."""

    def __init__(self, use_ocr: bool, ocr_reader: ReaderForEasyOCR = None):
        """Store the OCR settings handed to every OCR-capable parser."""
        self.use_ocr = use_ocr
        self.ocr_reader = ocr_reader

    def get_parser(self, file_path: str):
        """Return a parser instance for ``file_path``.

        Args:
            file_path: File name or path; only its extension is inspected,
                case-insensitively.

        Returns:
            A PDF, PPT, Word, Excel or text parser instance.

        Raises:
            ValueError: If the extension is not supported.
        """
        # Compare on the lower-cased extension only.
        suffix = os.path.splitext(file_path)[1].lower()
        ocr_options = {"use_ocr": self.use_ocr, "ocr_reader": self.ocr_reader}

        if suffix == '.pdf':
            return PDFParser(**ocr_options)
        if suffix == '.pptx':
            return PPTParser(**ocr_options)
        if suffix == '.docx':
            return WordParser(**ocr_options)
        if suffix in ('.xlsx', '.xls', '.csv'):
            return ExcelParser(**ocr_options)
        if suffix == '.txt':
            # The text parser takes no OCR options.
            return TextParser()
        raise ValueError("Unsupported file format")
import pandas as pd
import logging
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as OpenPyXLImage
from typing import Optional
from PIL import Image, UnidentifiedImageError
import io
import xlrd
from modules.ocr import ReaderForEasyOCR
import asyncio
import os
logger = logging.getLogger()
class ExcelParser:
    """Convert CSV / XLS / XLSX file contents into per-sheet CSV text entries.

    Every entry is a dict ``{"document_id": "<file name>@<sheet name>",
    "text": <sheet rendered as CSV without index or header>}``.
    """

    def __init__(self, use_ocr: bool, ocr_reader: Optional[ReaderForEasyOCR] = None):
        """Store the OCR switch and the optional OCR reader."""
        self.use_ocr = use_ocr
        self.ocr_reader = ocr_reader

    async def parse(self, contents: bytes, filename: str) -> list:
        """Parse spreadsheet bytes into a list of sheet entries.

        Args:
            contents: Raw bytes of the file.
            filename: File name; its lower-cased extension selects the
                reader and it prefixes every ``document_id``.

        Returns:
            One entry per sheet (a CSV file yields a single ``Sheet1`` entry).

        Raises:
            ValueError: If the extension is not ``.csv``, ``.xls`` or ``.xlsx``.
        """
        document_name = filename
        # Compare on the lower-cased extension only.
        file_extension = os.path.splitext(filename)[1].lower()

        if file_extension == '.csv':
            return self._parse_csv(contents, document_name)
        if file_extension == '.xls':
            return self._parse_xls(contents, document_name)
        if file_extension == '.xlsx':
            return await self._parse_xlsx(contents, document_name)
        raise ValueError("Unsupported file format")

    @staticmethod
    def _make_entry(document_name, sheet_name, df):
        """Build the result dict for one sheet from its DataFrame."""
        return {
            "document_id": f"{document_name}@{sheet_name}",
            "text": df.to_csv(index=False, header=False),
        }

    def _parse_csv(self, contents, document_name):
        """Read a CSV file as a single sheet named ``Sheet1``."""
        # header=None keeps the first line as data; with the default header
        # inference it would be dropped by to_csv(header=False).
        df = pd.read_csv(io.BytesIO(contents), header=None)
        return [self._make_entry(document_name, 'Sheet1', df)]

    def _parse_xls(self, contents, document_name):
        """Read every sheet of a legacy ``.xls`` workbook."""
        # xlrd takes in-memory data through ``file_contents``; the first
        # positional argument is a file name. formatting_info=True is what
        # makes ``sheet.merged_cells`` available.
        workbook = xlrd.open_workbook(file_contents=contents, formatting_info=True)
        entries = []
        for sheet in workbook.sheets():
            data = [sheet.row_values(row) for row in range(sheet.nrows)]
            df = pd.DataFrame(data)
            df = self.fill_merged_cells_xls(df, sheet)
            entries.append(self._make_entry(document_name, sheet.name, df))
        return entries

    async def _parse_xlsx(self, contents, document_name):
        """Read every sheet of an ``.xlsx`` workbook, including embedded images."""
        workbook = load_workbook(filename=io.BytesIO(contents), data_only=True)
        entries = []
        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            df = pd.DataFrame(sheet.values)
            df = await self._insert_image_texts(df, sheet, sheet_name)
            df = self.fill_merged_cells_xlsx(df, sheet)
            entries.append(self._make_entry(document_name, sheet_name, df))
        return entries

    async def _insert_image_texts(self, df, sheet, sheet_name):
        """Write an ``(image)...(/image)`` marker into the cell of each sheet image.

        Images of at least 150x150 pixels are OCR-ed when OCR is enabled and
        a reader is configured; WMF and unidentifiable images are skipped.
        """
        for img in sheet._images:
            # NOTE(review): openpyxl anchor row/col look zero-based already,
            # so the "- 1" may shift the target by one cell (and wrap to the
            # last cell for an image anchored at row/col 0) - confirm.
            img_cell = img.anchor._from.row - 1, img.anchor._from.col - 1

            # Grow the DataFrame when the image sits outside the data area.
            max_row, max_col = img_cell
            if max_row >= len(df):
                df = df.reindex(range(max_row + 1), fill_value='')
            if max_col >= len(df.columns):
                df = df.reindex(columns=range(max_col + 1), fill_value='')

            try:
                img_data = img._data()
                img_obj = Image.open(io.BytesIO(img_data))

                # WMF images cannot be processed; skip them.
                if img_obj.format == "WMF":
                    logger.warning(f"Skipping WMF image in sheet {sheet_name} as it cannot be processed.")
                    continue

                ocr_text = ""
                width, height = img_obj.size
                if self.use_ocr and self.ocr_reader and (width >= 150 and height >= 150):
                    # Convert to grayscale before OCR.
                    img_obj = img_obj.convert('L')
                    ocr_results = await self.ocr_reader(img_obj)
                    ocr_text = "\n".join([text for bbox, text in ocr_results])
                    ocr_text = f"(ocr)\n{ocr_text}\n(/ocr)"

                # Put the image marker (with any OCR text) into the cell.
                df.iat[img_cell[0], img_cell[1]] = f"(image)\n{ocr_text}\n(/image)"
            except UnidentifiedImageError:
                logger.error(f"Unable to identify image format in sheet {sheet_name}. Skipping this image.")
        return df

    def fill_merged_cells_xlsx(self, df, sheet):
        """Copy the top-left value of each merged range into its empty cells.

        Args:
            df: DataFrame holding the sheet values (0-based positions).
            sheet: openpyxl worksheet whose merged ranges use 1-based bounds.

        Returns:
            The same DataFrame, modified in place.
        """
        for merged_cell in sheet.merged_cells.ranges:
            min_col, min_row, max_col, max_row = merged_cell.bounds
            top_left_cell_value = sheet.cell(row=min_row, column=min_col).value
            for row in range(min_row, max_row + 1):
                for col in range(min_col, max_col + 1):
                    # Only fill cells that are still empty.
                    if pd.isna(df.iat[row - 1, col - 1]):
                        df.iat[row - 1, col - 1] = top_left_cell_value
        return df

    def fill_merged_cells_xls(self, df, sheet):
        """Copy the top-left value of each merged range into its empty cells.

        Args:
            df: DataFrame holding the sheet values.
            sheet: xlrd sheet; each ``merged_cells`` item is unpacked as
                ``(min_row, max_row, min_col, max_col)`` and the upper
                bounds are treated as exclusive.

        Returns:
            The same DataFrame, modified in place.
        """
        for merged_cell in sheet.merged_cells:
            min_row, max_row, min_col, max_col = merged_cell
            top_left_cell_value = sheet.cell_value(min_row, min_col)
            for row in range(min_row, max_row):
                for col in range(min_col, max_col):
                    current = df.iat[row, col]
                    # Only fill cells that are still empty (NaN/None or '').
                    if pd.isna(current) or current == '':
                        df.iat[row, col] = top_left_cell_value
        return df
import io
import os
import fitz # PyMuPDF
from PIL import Image, UnidentifiedImageError
import logging
import pandas as pd
from typing import Union, Optional, List, Tuple
from modules.ocr import ReaderForEasyOCR
import asyncio
logger = logging.getLogger()
def prepare_inputs(path_or_content: Union[str, bytes]) -> fitz.Document:
    """
    Prepare inputs for PyMuPDF.

    Args:
        path_or_content: File path (``str`` or ``os.PathLike``) or the raw
            PDF content as ``bytes``.

    Returns:
        PyMuPDF document object.

    Raises:
        ValueError: If the input is neither a path nor bytes.
    """
    if isinstance(path_or_content, (str, os.PathLike)):
        # Open directly from the file path.
        return fitz.open(path_or_content)
    if isinstance(path_or_content, bytes):
        # Open from in-memory content; the type must be stated explicitly.
        return fitz.open(stream=path_or_content, filetype="pdf")
    raise ValueError("Invalid input type")
class PDFParser:
    """Extract text, tables and image markers from a PDF, one entry per page."""

    def __init__(self, use_ocr: bool, ocr_reader: Optional[ReaderForEasyOCR] = None):
        """Store the OCR switch and the optional OCR reader."""
        self.use_ocr = use_ocr
        self.ocr_reader = ocr_reader

    async def parse(self, file_path: Union[str, bytes], file_name: Optional[str] = None) -> List[dict]:
        """Parse a PDF into per-page entries.

        Args:
            file_path: Path to the PDF, or its content as bytes.
            file_name: Name used in ``document_id``; when omitted, the base
                name of a path input is used, otherwise ``"Unknown"``.

        Returns:
            One dict per page: ``{"document_id": "<name>@<page, 4 digits>",
            "text": <page elements joined by blank lines>}``.
        """
        if file_name is not None:
            name = file_name
        elif isinstance(file_path, (str, os.PathLike)):
            name = os.path.basename(file_path)
        else:
            name = "Unknown"

        parsed_content = []
        doc = prepare_inputs(file_path)  # fitz.Document
        try:
            for page_number in range(len(doc)):
                page = doc.load_page(page_number)
                elements = await self._image_elements(doc, page, page_number)
                elements.extend(self._text_and_table_elements(page))

                # Order the elements by their vertical position on the page.
                elements.sort(key=lambda x: x[0])
                page_content = "\n\n".join(element[2] for element in elements)

                parsed_content.append({
                    "document_id": f"{name}@{page_number + 1:04}",
                    "text": page_content
                })
        finally:
            # Release the document even when a page fails to parse.
            doc.close()
        return parsed_content

    async def _image_elements(self, doc, page, page_number):
        """Return ``(y0, 'image', text)`` elements for the images on a page.

        Images of at least 150x150 pixels are OCR-ed when OCR is enabled and
        a reader is configured; WMF and unidentifiable images are skipped.
        """
        elements = []
        for img in page.get_images(full=True):
            xref = img[0]
            try:
                base_image = doc.extract_image(xref)
                image = Image.open(io.BytesIO(base_image["image"]))

                # WMF images cannot be processed; skip them.
                if image.format == "WMF":
                    logger.warning(f"Skipping WMF image on page {page_number + 1} as it cannot be processed.")
                    continue

                width, height = image.size
                # The get_images() tuple carries no coordinates, so the
                # vertical position comes from the image's first placement
                # rectangle (0 when none is reported).
                rects = page.get_image_rects(xref)
                y0 = rects[0].y0 if rects else 0

                ocr_text = ""
                if self.use_ocr and self.ocr_reader and (width >= 150 and height >= 150):
                    # Convert to grayscale before OCR.
                    image = image.convert('L')
                    ocr_results = await self.ocr_reader(image)
                    ocr_text = "\n".join([text for bbox, text in ocr_results])
                    ocr_text = f"(ocr)\n{ocr_text}\n(/ocr)"

                image_text = f"(image)\n{ocr_text}\n(/image)"
                elements.append((y0, 'image', image_text))
            except UnidentifiedImageError:
                logger.error(f"Unable to identify image format for an image on page {page_number + 1}. Skipping this image.")
        return elements

    def _text_and_table_elements(self, page):
        """Return ``(y, kind, text)`` elements for a page's text blocks and tables.

        A block lying fully inside a table area is represented by that
        table's CSV, emitted once per table; every other block is emitted as
        plain text.
        """
        elements = []
        blocks = page.get_text("dict")["blocks"]

        tables = page.find_tables()
        added_tables = set()  # indexes of tables already emitted
        table_areas = [table.bbox for table in tables]

        for block in blocks:
            x0, y0, x1, y1 = block['bbox']
            text = " ".join([span["text"] for line in block.get("lines", []) for span in line["spans"]])

            for table_index, table_area in enumerate(table_areas):
                table_x0, table_y0, table_x1, table_y1 = table_area
                inside_table = (
                    x0 >= table_x0 and y0 >= table_y0 and
                    x1 <= table_x1 and y1 <= table_y1
                )
                if inside_table:
                    if table_index not in added_tables:
                        table_content = self.convert_table_to_csv(tables[table_index])
                        elements.append((table_y0, 'table', table_content))
                        added_tables.add(table_index)
                    break
            else:
                # The block belongs to no table: keep it as plain text.
                elements.append((y0, 'text', text.strip()))
        return elements

    def convert_table_to_csv(self, table):
        """Convert PyMuPDF table to CSV format"""
        data = table.extract()
        df = pd.DataFrame(data)
        df = self.unmerge_cells(df)
        return df.to_csv(index=False, header=False)

    def convert_table_to_markdown(self, table):
        """Convert PyMuPDF table to markdown table (first row is the header)."""
        data = table.extract()
        if not data:
            # No rows at all: nothing to render (and no header row to index).
            return ""
        df = pd.DataFrame(data[1:], columns=data[0])
        df = self.unmerge_cells(df)
        return df.to_markdown(index=False)

    def unmerge_cells(self, df):
        """Fill empty cells from their neighbours to undo merged-cell gaps.

        Values are propagated down then up each column, and afterwards
        right then left along each row.
        """
        # TODO: handle edge cases for combined horizontal/vertical merges
        # Vertical merges
        df = df.ffill(axis=0).bfill(axis=0)
        # Horizontal merges
        df = df.ffill(axis=1).bfill(axis=1)
        return df
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
import os
from typing import Optional
from modules.ocr import ReaderForEasyOCR
from PIL import Image, UnidentifiedImageError
import io
import asyncio
import logging
logger = logging.getLogger()
class PPTParser:
    """Extract text, tables, image markers and notes from a PPTX, one entry per slide."""

    def __init__(self, use_ocr: bool, ocr_reader: Optional[ReaderForEasyOCR] = None):
        """Store the OCR switch and the optional OCR reader."""
        self.use_ocr = use_ocr
        self.ocr_reader = ocr_reader

    async def parse(self, contents: bytes, filename: str) -> list:
        """Parse presentation bytes into per-slide entries.

        Args:
            contents: Raw bytes of the ``.pptx`` file.
            filename: File name; its base name prefixes every ``document_id``.

        Returns:
            One dict per slide with non-blank text:
            ``{"document_id": "<name>@<slide, 4 digits>", "text": ...}``.
        """
        parsed_content = []
        # Wrap the bytes in a file-like object for python-pptx.
        with io.BytesIO(contents) as temp_file:
            prs = Presentation(temp_file)
            document_name = os.path.basename(filename)

            for slide_idx, slide in enumerate(prs.slides):
                slide_identifier = f"{document_name}@{slide_idx + 1:04}"
                slide_text = await self.process_slide(slide)
                notes_text = self.extract_notes(slide)

                # Combine the slide text and its notes into a single entry.
                full_text = slide_text
                if notes_text.strip():
                    full_text += "\n\n[Notes]\n" + notes_text

                if full_text.strip():
                    parsed_content.append({
                        "document_id": slide_identifier,
                        "text": full_text
                    })
        return parsed_content

    @staticmethod
    def _position_key(shape):
        """Sort key ordering shapes top-to-bottom, then left-to-right."""
        # A missing offset is treated as 0 so shapes without a position
        # still compare cleanly.
        return (shape.top or 0, shape.left or 0)

    async def extract_text_from_shape(self, shape):
        """Return the text of one shape.

        Groups are expanded recursively, pictures become an
        ``(image)...(/image)`` marker (OCR-ed when enabled and at least
        150x150 pixels), and text frames yield one line per paragraph.
        WMF or unidentifiable pictures yield an empty string.
        """
        text_runs = []
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            text_runs.append(await self.extract_text_from_group(shape))
        elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                image_stream = shape.image.blob
                image = Image.open(io.BytesIO(image_stream))

                if image.format == "WMF":
                    logger.warning(f"Skipping WMF image in shape {shape} as it cannot be processed.")
                    return ""  # WMF images are skipped

                ocr_text = ""
                width, height = image.size
                if self.use_ocr and self.ocr_reader and (width >= 150 and height >= 150):
                    # Convert to grayscale before OCR.
                    image = image.convert('L')
                    ocr_results = await self.ocr_reader(image)
                    ocr_text = "\n".join([text for bbox, text in ocr_results])
                    ocr_text = f"(ocr)\n{ocr_text}\n(/ocr)"

                text_runs.append(f"(image)\n{ocr_text}\n(/image)")
            except UnidentifiedImageError:
                logger.error(f"Unable to identify image format for shape {shape}. Skipping this image.")
                return ""
        elif shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                paragraph_text = "".join(run.text for run in paragraph.runs)
                text_runs.append(paragraph_text)
        return '\n'.join(text_runs)

    async def extract_text_from_group(self, group):
        """Return the text of every shape in a group, in reading order."""
        text_runs = []
        shapes_sorted = sorted(group.shapes, key=self._position_key)
        for shape in shapes_sorted:
            text_runs.append(await self.extract_text_from_shape(shape))
        return '\n'.join(text_runs)

    async def extract_text_from_slide(self, slide):
        """Return the slide's shape text: grouped shapes first, then the rest."""
        grouped_texts = []
        ungrouped_texts = []
        shapes_sorted = sorted(slide.shapes, key=self._position_key)
        for shape in shapes_sorted:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                grouped_texts.append(await self.extract_text_from_group(shape))
            else:
                ungrouped_texts.append(await self.extract_text_from_shape(shape))
        return "\n".join(grouped_texts + ungrouped_texts)

    def extract_and_split_table(self, slide):
        """Return every table on the slide as a list of rows of cell texts.

        Merged cells are split and the origin cell's text is copied into
        each cell of the former span. This modifies the in-memory
        presentation.
        """
        tables = []
        for shape in slide.shapes:
            if shape.shape_type != MSO_SHAPE_TYPE.TABLE:
                continue
            table = shape.table
            table_data = []
            for row in range(len(table.rows)):
                row_data = []
                for col in range(len(table.columns)):
                    cell = table.cell(row, col)
                    if cell.is_merge_origin:
                        text = cell.text
                        span_height = cell.span_height
                        span_width = cell.span_width
                        cell.split()  # undo the merge
                        for i in range(span_height):
                            for j in range(span_width):
                                table.cell(row + i, col + j).text = text
                    row_data.append(cell.text)
                table_data.append(row_data)
            tables.append(table_data)
        return tables

    def table_to_csv(self, table):
        """Render rows of cell texts as lines of ``", "``-separated values.

        Cell texts are joined verbatim; no quoting or escaping is applied.
        """
        return "".join(", ".join(row) + "\n" for row in table)

    async def process_slide(self, slide):
        """Return the slide's shape text followed by each of its tables."""
        slide_text_sections = await self.extract_text_from_slide(slide)
        tables = self.extract_and_split_table(slide)
        full_text = slide_text_sections + "\n"
        for table in tables:
            full_text += self.table_to_csv(table) + "\n"
        return full_text

    def extract_notes(self, slide):
        """Return the slide's speaker notes, or ``""`` when there are none."""
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            # A notes slide may lack a notes text frame.
            if notes_frame is not None:
                return notes_frame.text
        return ""
