import requests
import json
import base64
from PIL import Image
import io
from pdf2image import convert_from_path
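
# Note: pdf2image requires the poppler utilities to be installed on the system,
# and this script assumes an Ollama server is running locally with the
# granite3.2-vision model already pulled (e.g. `ollama pull granite3.2-vision`).
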
def analyze_document_with_granite(json_path, pdf_path, output_file=None):
"""
Scans a converted PDF document using Granite Vision via Ollama.
Args:
json_path: Path to the Docling Generated JSON File
pdf_path: Path to the original PDF document
output_file: Path to save the scan (optional)
Returns:
Dictionary containing analysis results
"""
print(f"Document analysis with Granite Vision...")
    # Helper function to send a request to the Ollama API
    def query_ollama(prompt, images=None):
        api_url = "http://localhost:11434/api/generate"
        request_data = {
            "model": "granite3.2-vision",
            "prompt": prompt,
            "stream": False
        }
        if images:
            request_data["images"] = images
        response = requests.post(api_url, json=request_data)
        if response.status_code != 200:
            raise Exception(f"Ollama API error: {response.text}")
        return response.json()["response"]
    # Load the JSON document generated by Docling
    with open(json_path, "r", encoding="utf-8") as f:
        doc_data = json.load(f)

    # Extract the text from the document
    doc_content = ""
    for item in doc_data["texts"]:
        doc_content += item.get("text", "") + "\n"

    # Limit the length for the initial analysis
    summary_content = doc_content[:2000] + "..." if len(doc_content) > 2000 else doc_content
    # Textual classification of the document
    classification_prompt = f"""
    Analyze the following content from a PDF document and answer the questions:

    {summary_content}

    1. What is the type of document (technical report, scientific article, documentation, etc.)?
    2. What are the main themes covered?
    3. Who is this document intended for?
    4. What is the general structure of the document?
    """
    classification_result = query_ollama(classification_prompt)
    print("Text classification complete")
    # Visual analysis of the first page
    try:
        # Convert the first page of the PDF to an image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        first_page = images[0]

        # Resize if necessary
        max_size = (1200, 1600)
        if first_page.width > max_size[0] or first_page.height > max_size[1]:
            first_page.thumbnail(max_size, Image.LANCZOS)

        # Convert to base64
        buffered = io.BytesIO()
        first_page.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

        # Visual analysis prompt
        visual_prompt = """
        Analyze the layout and visual structure of this document page:
        1. How is the layout (columns, sections) organized?
        2. What visual elements are present (tables, figures, diagrams)?
        3. How is the visual hierarchy (titles, subtitles) structured?
        4. Are there any distinctive or special elements?
        """
        visual_analysis = query_ollama(visual_prompt, [img_base64])
        print("Visual analysis complete")
    except Exception as e:
        visual_analysis = f"Error during visual analysis: {str(e)}"
        print(f"Warning: {visual_analysis}")
    # Extract tables for specific analysis
    tables = doc_data.get("tables", [])
    html_output = ""
    for table in tables:
        data = table.get("data", {})
        num_rows = data.get("num_rows", 0)
        num_cols = data.get("num_cols", 0)
        grid = data.get("grid", [])

        # Initialize an empty grid for the final render
        final_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        # Place cells according to their positions, handling row/column spans
        for row in grid:
            for cell in row:
                r_start = cell["start_row_offset_idx"]
                c_start = cell["start_col_offset_idx"]
                rowspan = cell["row_span"]
                colspan = cell["col_span"]
                text = cell["text"].strip()

                # Avoid overwriting cells that have already been filled (duplicates in the data)
                if final_grid[r_start][c_start] is None:
                    final_grid[r_start][c_start] = {
                        "text": text,
                        "rowspan": rowspan,
                        "colspan": colspan,
                        "is_header": cell.get("column_header", False) or cell.get("row_header", False),
                        "row_section": cell.get("row_section", False)
                    }
                    # Mark the cells covered by the spans as occupied
                    for r in range(r_start, r_start + rowspan):
                        for c in range(c_start, c_start + colspan):
                            if r == r_start and c == c_start:
                                continue
                            final_grid[r][c] = "SPAN"
        # HTML generation
        html_output += "<table border='1'>\n"
        for row in final_grid:
            html_output += "<tr>\n"
            for cell in row:
                if cell is None or cell == "SPAN":
                    continue
                tag = "th" if cell["is_header"] else "td"
                rowspan = f" rowspan='{cell['rowspan']}'" if cell["rowspan"] > 1 else ""
                colspan = f" colspan='{cell['colspan']}'" if cell["colspan"] > 1 else ""
                section_class = " class='section'" if cell["row_section"] else ""
                html_output += f"  <{tag}{rowspan}{colspan}{section_class}>{cell['text']}</{tag}>\n"
            html_output += "</tr>\n"
        html_output += "</table>\n"
    table_analysis = None
    if tables:
        table_prompt = f"""
        The document contains {len(tables)} table(s). Here is the table data rendered as HTML:

        {html_output}

        Analyze this table data and describe its content and purpose in the document.
        """
        table_analysis = query_ollama(table_prompt)
        print("Table analysis complete")
    # Combine all results
    final_analysis = {
        "text_classification": classification_result,
        "visual_structure_analysis": visual_analysis,
        "table_analysis": table_analysis
    }
    # Save the results if requested
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(final_analysis, f, ensure_ascii=False, indent=2)
        print(f"Analysis results saved to: {output_file}")

    return final_analysis
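

# Example usage: a minimal sketch. The paths below are hypothetical and assume
# the PDF has already been converted to JSON with Docling.
if __name__ == "__main__":
    results = analyze_document_with_granite(
        json_path="output/document.json",    # hypothetical Docling JSON output
        pdf_path="input/document.pdf",       # hypothetical source PDF
        output_file="output/analysis.json"   # optional: where to save the results
    )
    print(results["text_classification"])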