import pdfplumber import docx import csv import os from io import StringIO, BytesIO from PIL import Image import base64 def extract_text_tables(file_path: str) -> str: result = "" if file_path.endswith(".pdf"): with pdfplumber.open(file_path) as pdf: for page in pdf.pages: text = page.extract_text() if text: result += "
" + text.replace("\n", "
") + "
" tables = page.extract_tables() if tables: for table in tables: csv_output = StringIO() csv_writer = csv.writer(csv_output) csv_writer.writerows(table) result += f"{csv_output.getvalue()}"
# Extract images embedded in the PDF page
if page.images:
for img in page.images:
img_data = img["stream"].get_data()
encoded_img = base64.b64encode(img_data).decode("utf-8")
result += f'{para.text}
") for table in doc.tables: csv_output = StringIO() csv_writer = csv.writer(csv_output) for row in table.rows: csv_writer.writerow([cell.text.strip() for cell in row.cells]) table_data.append(f"{csv_output.getvalue()}")
# Extract images referenced by the DOCX document's relationships
for rel in doc.part.rels:
if "image" in doc.part.rels[rel].target_ref:
image_data_blob = doc.part.rels[rel].target_part.blob
encoded_img = base64.b64encode(image_data_blob).decode("utf-8")
image_data.append(f'