Using Lift to Turn Research PDFs into Structured JSON with Controlled, Schema-Guided Field-Level Evaluation

Using Lift to Turn Research PDFs into Structured JSON with Controlled, Schema-Guided Field-Level Evaluation


def render_pdf(d, path):
    """Render one synthetic research report described by ``d`` to a PDF at ``path``.

    The document is laid out in three parts separated by forced page breaks, so the
    headline metric quoted in the abstract (first part) is physically separated from
    the results table (last part):

    1. title, authors, abstract, keywords
    2. method/training details (Table 1) and datasets
    3. results (Table 2), limitations, funding note

    Args:
        d: Mapping describing the report. Keys read here: ``title``, ``authors``
            (iterable of two-item pairs, rendered as ``"first (second)"``),
            ``method``, ``task``, ``primary_benchmark``, ``metric_name``,
            ``test_acc``, ``val_acc``, ``prior_best``, ``beats_sota``, ``params_m``,
            ``datasets`` (sized iterable of strings), ``optimizer``, ``lr``,
            ``batch``, ``epochs``, ``baseline_name``, ``baseline_val``,
            ``baseline_test``, ``limitations`` (iterable of strings) and
            ``funding_note``.
        path: Destination file path handed to ``SimpleDocTemplate``.

    Raises:
        KeyError: If ``d`` lacks one of the keys listed above.

    Note:
        ReportLab parses ``Paragraph`` text as XML-like markup. All values taken
        from ``d`` are therefore escaped and rendered as literal text, so characters
        such as ``&``, ``<`` and ``>`` in titles, affiliations or notes cannot break
        or alter the layout.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib import colors
    from reportlab.lib.pagesizes import LETTER
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer,
                                    Table, TableStyle, PageBreak)

    def esc(value):
        """Return ``value`` as text that is safe to embed in Paragraph markup."""
        return escape(str(value))

    def boxed_table(rows, col_widths, header_hex, stripe_hex, extra=()):
        """Build a gridded table with a coloured header row and striped body rows."""
        table = Table(rows, colWidths=col_widths)
        table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(header_hex)),
            ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
            ("FONTSIZE", (0, 0), (-1, -1), 9.5),
            ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
            ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor(stripe_hex)]),
            ("LEFTPADDING", (0, 0), (-1, -1), 8),
            ("TOPPADDING", (0, 0), (-1, -1), 4),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 4),
            *extra,
        ]))
        return table

    ss = getSampleStyleSheet()
    H1 = ParagraphStyle("H1", parent=ss["Title"], fontSize=16, leading=20, spaceAfter=6)
    AUTH = ParagraphStyle("AUTH", parent=ss["Normal"], fontSize=9.5,
                          textColor=colors.grey, spaceAfter=10)
    H2 = ParagraphStyle("H2", parent=ss["Heading2"], fontSize=12, spaceBefore=8, spaceAfter=4)
    BODY = ParagraphStyle("BODY", parent=ss["Normal"], fontSize=10, leading=14, spaceAfter=6)

    # Escaped copies of the values that are interpolated into Paragraph markup.
    method = esc(d["method"])
    task = esc(d["task"])
    benchmark = esc(d["primary_benchmark"])
    metric = esc(d["metric_name"])
    params = esc(d["params_m"])
    optimizer = esc(d["optimizer"])
    datasets_text = ", ".join(esc(name) for name in d["datasets"])
    authors_line = ", ".join(f"{esc(n)} ({esc(a)})" for (n, a) in d["authors"])

    if d["beats_sota"]:
        sota_phrase = f"surpassing the previous best of {esc(d['prior_best'])}"
    else:
        sota_phrase = (f"approaching but not exceeding the previous best of "
                       f"{esc(d['prior_best'])}")

    # ---- Part 1: title, authors, abstract, keywords ----------------------------
    story = [
        Paragraph(esc(d["title"]), H1),
        Paragraph(authors_line, AUTH),
        Paragraph("Abstract", H2),
        Paragraph(
            f"We introduce {method}, a model for {task}. On the {benchmark} "
            f"benchmark, {method} attains {esc(d['test_acc'])} {metric} on the held-out "
            f"test set, {sota_phrase}. Our {params}M-parameter model is evaluated across "
            f"{len(d['datasets'])} datasets ({datasets_text}). "
            f"Extensive ablations confirm the contribution of each component.", BODY),
        Paragraph("Keywords", H2),
        Paragraph(f"{task}; representation learning; {benchmark}", BODY),
        PageBreak(),
    ]

    # ---- Part 2: training configuration and datasets ---------------------------
    # Plain-string table cells are not parsed as markup, so they stay unescaped.
    hp = [["Hyperparameter", "Value"],
          ["Optimizer", d["optimizer"]],
          ["Learning rate", str(d["lr"])],
          ["Batch size", str(d["batch"])],
          ["Epochs", str(d["epochs"])],
          ["Parameters", f"{d['params_m']}M"]]
    t1 = boxed_table(hp, [2.4 * inch, 2.0 * inch], "#2b3a67", "#eef1f8")
    story += [
        Paragraph("1  Method and Training Details", H2),
        Paragraph(
            f"{method} is trained end-to-end with the {optimizer} optimizer. "
            f"We tune on a validation split and report final numbers on the test split. "
            f"The full training configuration is summarized in Table 1.", BODY),
        Spacer(1, 4), t1, Spacer(1, 6),
        Paragraph("Table 1. Training configuration.", BODY),
        Paragraph("2  Datasets", H2),
        Paragraph(
            f"We evaluate on {datasets_text}. {benchmark} is our "
            f"primary benchmark; the remaining datasets are used for generalization "
            f"studies.", BODY),
        PageBreak(),
    ]

    # ---- Part 3: results, limitations, funding ---------------------------------
    res = [["Method", f"Val. {d['metric_name']}", f"Test {d['metric_name']}"],
           [f"{d['baseline_name']} (baseline)", str(d["baseline_val"]), str(d["baseline_test"])],
           [f"{d['method']} (ours)", str(d["val_acc"]), str(d["test_acc"])]]
    # Row 2 (the "ours" row) is set in bold, matching the table caption.
    t2 = boxed_table(res, [2.6 * inch, 1.7 * inch, 1.7 * inch], "#7a2e2e", "#f7eeee",
                     extra=[("FONTNAME", (0, 2), (-1, 2), "Helvetica-Bold")])
    story += [
        Paragraph("3  Results", H2),
        Spacer(1, 4), t2, Spacer(1, 6),
        Paragraph(f"Table 2. Results on {benchmark}. "
                  f"Best test result in bold.", BODY),
        Paragraph("4  Limitations", H2),
    ]
    story += [Paragraph("• " + esc(lim), BODY) for lim in d["limitations"]]
    story += [Paragraph("5  Funding and Code Availability", H2),
              Paragraph(esc(d["funding_note"]), BODY)]

    SimpleDocTemplate(path, pagesize=LETTER,
                      topMargin=0.8 * inch, bottomMargin=0.8 * inch,
                      leftMargin=0.9 * inch, rightMargin=0.9 * inch).build(story)
print("STEP 3/7 · Generating synthetic report PDFs…")

# Each CORPUS entry is a (doc spec, ground_truth(doc spec), pdf path) triple.
CORPUS = []
for i, d in enumerate(DOCS):
    # Write under /content when that directory exists (presumably a Colab
    # runtime — confirm); otherwise write into the current working directory.
    if os.path.isdir("/content"):
        path = f"/content/report_{i}.pdf"
    else:
        path = f"report_{i}.pdf"
    render_pdf(d, path)
    CORPUS.append((d, ground_truth(d), path))
    print(f"     ✓ {os.path.basename(path)}  —  {d['method']}")
print()

if SHOW_FIRST_PAGE:
    # Best-effort preview: any failure (missing optional packages, rendering
    # error, empty corpus) is reported and skipped rather than aborting the run.
    try:
        import pypdfium2 as pdfium
        import matplotlib.pyplot as plt

        # Rasterize the first page of the first generated report.
        pg = pdfium.PdfDocument(CORPUS[0][2])[0]
        img = pg.render(scale=2.0).to_pil()

        plt.figure(figsize=(6.4, 8.3))
        plt.imshow(img)
        plt.axis("off")
        plt.title("What lift reads — page 1 of report_0.pdf", fontsize=10)
        plt.show()
    except Exception as e:
        print("     (page preview skipped:", e, ")\n")



Source link