Skip to content

Commit aea0e28

Browse files
committed
pdf example
1 parent d36f60d commit aea0e28

5 files changed

Lines changed: 124 additions & 0 deletions

File tree

notebooks/03-PDF.ipynb

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "a4143f5d-dbf5-42f9-b569-1dfe541c3dae",
6+
"metadata": {},
7+
"source": [
8+
"## PDF example with HungarianMatcher"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": 8,
14+
"id": "332e812d-a458-4034-a9a8-386253fde1a1",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"from structflo.cser.pipeline import ChemPipeline, HungarianMatcher\n",
19+
"\n",
20+
"pipeline = ChemPipeline(\n",
21+
" weights=\"../runs/labels_detect/yolo11l_panels/weights/best.pt\",\n",
22+
" tile=False,\n",
23+
" conf=0.20,\n",
24+
" matcher=HungarianMatcher(max_distance=500),\n",
25+
" )\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 9,
31+
"id": "9a45cc21-035f-4ed2-91c6-99bb59166b3c",
32+
"metadata": {},
33+
"outputs": [
34+
{
35+
"name": "stdout",
36+
"output_type": "stream",
37+
"text": [
38+
"Page 1: 0 compound pairs\n",
39+
"Page 2: 4 compound pairs\n",
40+
" CHEMBL178234 C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4\n",
41+
" CHEMBL2139698 CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]\n",
42+
" CHEMBL1356585 C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl\n",
43+
" CHEMBL93928 CN1CC[C@@]23CCCCC3C1CC4=CC=C(C=C42)O\n",
44+
"Page 3: 6 compound pairs\n",
45+
" CHEMBL2139698 CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]\n",
46+
" CHEMBL358173 CCCCCN(CCCCC)C(=O)[C@H]1CCN([C@@H](C1)C(=O)NCCN(C)CC2=C(C=CC=C2)OC)C(=O)N(C3=CC=CC=C3)C4=CC=CC=C4.CCC\n",
47+
" CHEMBL174026 C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@@H](CC3=CC=CC=C3)C(=O)N(C)C)NC(=O)C4=CC=CC=C4\n",
48+
" CHEMBL178234 C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4\n",
49+
" CHEMBL2332709 CN1CCN(CC1)CC(COC2=CC3=C(C=CC(=O)O3)C=C2OC)O\n",
50+
" CHEMBL1356585 C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl\n"
51+
]
52+
}
53+
],
54+
"source": [
55+
"all_pages = pipeline.process_pdf(\"notebook-data/example.pdf\")\n",
56+
"\n",
57+
"for page_num, pairs in enumerate(all_pages):\n",
58+
" print(f\"Page {page_num + 1}: {len(pairs)} compound pairs\")\n",
59+
" for pair in pairs:\n",
60+
" print(f\" {pair.label_text:20s} {pair.smiles}\")"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 10,
66+
"id": "960c71fa-f45e-4b6e-93e4-8a9dfb8aa1f4",
67+
"metadata": {},
68+
"outputs": [
69+
{
70+
"data": {
71+
"text/plain": [
72+
"[[],\n",
73+
" [CompoundPair(structure=Detection(bbox=BBox(x1=387.1636047363281, y1=348.36968994140625, x2=613.1963500976562, y2=613.225830078125), conf=0.9752809405326843, class_id=0), label=Detection(bbox=BBox(x1=370.8852233886719, y1=675.03125, x2=553.0623779296875, y2=702.549560546875), conf=0.9521934390068054, class_id=1), match_distance=211.4725807905811, smiles='C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4', label_text='CHEMBL178234', match_confidence=None),\n",
74+
" CompoundPair(structure=Detection(bbox=BBox(x1=969.6254272460938, y1=348.4045715332031, x2=1215.8751220703125, y2=613.7841796875), conf=0.9751940965652466, class_id=0), label=Detection(bbox=BBox(x1=962.3702392578125, y1=674.9633178710938, x2=1158.2470703125, y2=702.4953002929688), conf=0.9447051882743835, class_id=1), match_distance=210.15404896831305, smiles='CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]', label_text='CHEMBL2139698', match_confidence=None),\n",
75+
" CompoundPair(structure=Detection(bbox=BBox(x1=665.7091064453125, y1=406.2452087402344, x2=927.4957885742188, y2=558.6116943359375), conf=0.9658717513084412, class_id=0), label=Detection(bbox=BBox(x1=666.3357543945312, y1=675.1514282226562, x2=862.4793701171875, y2=702.4734497070312), conf=0.9454785585403442, class_id=1), match_distance=208.88001556558808, smiles='C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl', label_text='CHEMBL1356585', match_confidence=None),\n",
76+
" CompoundPair(structure=Detection(bbox=BBox(x1=1260.3450927734375, y1=387.3260498046875, x2=1519.4493408203125, y2=573.7887573242188), conf=0.9371417760848999, class_id=0), label=Detection(bbox=BBox(x1=1258.1990966796875, y1=675.0844116210938, x2=1426.6123046875, y2=702.7105712890625), conf=0.9486707448959351, class_id=1), match_distance=213.6844316393017, smiles='CN1CC[C@@]23CCCCC3C1CC4=CC=C(C=C42)O', label_text='CHEMBL93928', match_confidence=None)],\n",
77+
" [CompoundPair(structure=Detection(bbox=BBox(x1=1688.369873046875, y1=34.59661865234375, x2=1934.6395263671875, y2=296.98468017578125), conf=0.9759939908981323, class_id=0), label=Detection(bbox=BBox(x1=1683.040771484375, y1=356.70135498046875, x2=1875.1815185546875, y2=382.9691162109375), conf=0.9374803304672241, class_id=1), match_distance=206.5999407921721, smiles='CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]', label_text='CHEMBL2139698', match_confidence=None),\n",
78+
" CompoundPair(structure=Detection(bbox=BBox(x1=1094.8671875, y1=629.5819091796875, x2=1360.641357421875, y2=890.28076171875), conf=0.9756079912185669, class_id=0), label=Detection(bbox=BBox(x1=1098.669189453125, y1=952.0912475585938, x2=1277.9361572265625, y2=978.4091796875), conf=0.9439839124679565, class_id=1), match_distance=209.0747962040637, smiles='CCCCCN(CCCCC)C(=O)[C@H]1CCN([C@@H](C1)C(=O)NCCN(C)CC2=C(C=CC=C2)OC)C(=O)N(C3=CC=CC=C3)C4=CC=CC=C4.CCC', label_text='CHEMBL358173', match_confidence=None),\n",
79+
" CompoundPair(structure=Detection(bbox=BBox(x1=1387.283447265625, y1=688.3386840820312, x2=1650.4847412109375, y2=834.5916748046875), conf=0.9742912650108337, class_id=0), label=Detection(bbox=BBox(x1=1390.84765625, y1=952.2308959960938, x2=1569.8570556640625, y2=978.0966796875), conf=0.9447979927062988, class_id=1), match_distance=207.31092088559816, smiles='C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@@H](CC3=CC=CC=C3)C(=O)N(C)C)NC(=O)C4=CC=CC=C4', label_text='CHEMBL174026', match_confidence=None),\n",
80+
" CompoundPair(structure=Detection(bbox=BBox(x1=1113.9666748046875, y1=35.128562927246094, x2=1339.7755126953125, y2=296.4732971191406), conf=0.9735834002494812, class_id=0), label=Detection(bbox=BBox(x1=1098.7281494140625, y1=356.7406005859375, x2=1278.6048583984375, y2=382.8806457519531), conf=0.9379453659057617, class_id=1), match_distance=207.55612634310035, smiles='C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4', label_text='CHEMBL178234', match_confidence=None),\n",
81+
" CompoundPair(structure=Detection(bbox=BBox(x1=1680.935546875, y1=716.2552490234375, x2=1941.0592041015625, y2=805.8529663085938), conf=0.9634426832199097, class_id=0), label=Detection(bbox=BBox(x1=1683.184814453125, y1=952.0816040039062, x2=1875.502197265625, y2=978.3987426757812), conf=0.9430668354034424, class_id=1), match_distance=206.6250635277353, smiles='CN1CCN(CC1)CC(COC2=CC3=C(C=CC(=O)O3)C=C2OC)O', label_text='CHEMBL2332709', match_confidence=None),\n",
82+
" CompoundPair(structure=Detection(bbox=BBox(x1=1389.677490234375, y1=90.54808044433594, x2=1650.2188720703125, y2=241.5001983642578), conf=0.9584228992462158, class_id=0), label=Detection(bbox=BBox(x1=1390.898193359375, y1=356.7479553222656, x2=1583.382080078125, y2=382.7828063964844), conf=0.9401251673698425, class_id=1), match_distance=206.3658432231776, smiles='C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl', label_text='CHEMBL1356585', match_confidence=None)]]"
83+
]
84+
},
85+
"execution_count": 10,
86+
"metadata": {},
87+
"output_type": "execute_result"
88+
}
89+
],
90+
"source": [
91+
"pipeline.process_pdf(\"notebook-data/example.pdf\", output_pdf=\"notebook-data/example-annotated.pdf\")\n"
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": null,
97+
"id": "a5a02db6-fc1d-46b8-92e7-47fd3972031f",
98+
"metadata": {},
99+
"outputs": [],
100+
"source": []
101+
}
102+
],
103+
"metadata": {
104+
"kernelspec": {
105+
"display_name": "Python 3 (ipykernel)",
106+
"language": "python",
107+
"name": "python3"
108+
},
109+
"language_info": {
110+
"codemirror_mode": {
111+
"name": "ipython",
112+
"version": 3
113+
},
114+
"file_extension": ".py",
115+
"mimetype": "text/x-python",
116+
"name": "python",
117+
"nbconvert_exporter": "python",
118+
"pygments_lexer": "ipython3",
119+
"version": "3.12.8"
120+
}
121+
},
122+
"nbformat": 4,
123+
"nbformat_minor": 5
124+
}
295 KB
Binary file not shown.
315 KB
Binary file not shown.
538 KB
Binary file not shown.
222 KB
Loading

0 commit comments

Comments
 (0)