|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "id": "a4143f5d-dbf5-42f9-b569-1dfe541c3dae", |
| 6 | + "metadata": {}, |
| 7 | + "source": [ |
| 8 | + "## PDF example with HungarianMatcher" |
| 9 | + ] |
| 10 | + }, |
| 11 | + { |
| 12 | + "cell_type": "code", |
| 13 | + "execution_count": 8, |
| 14 | + "id": "332e812d-a458-4034-a9a8-386253fde1a1", |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [], |
| 17 | + "source": [ |
| 18 | + "from structflo.cser.pipeline import ChemPipeline, HungarianMatcher\n", |
| 19 | + "\n", |
| 20 | + "# Configure the pipeline from a weights file, with tile=False, conf=0.20,\n", |
| 21 | + "# and a HungarianMatcher limited to max_distance=500.\n", |
| 22 | + "pipeline = ChemPipeline(\n", |
| 23 | + "    weights=\"../runs/labels_detect/yolo11l_panels/weights/best.pt\",\n", |
| 24 | + "    tile=False, conf=0.20, matcher=HungarianMatcher(max_distance=500),\n", |
| 25 | + ")\n" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 9, |
| 31 | + "id": "9a45cc21-035f-4ed2-91c6-99bb59166b3c", |
| 32 | + "metadata": {}, |
| 33 | + "outputs": [ |
| 34 | + { |
| 35 | + "name": "stdout", |
| 36 | + "output_type": "stream", |
| 37 | + "text": [ |
| 38 | + "Page 1: 0 compound pairs\n", |
| 39 | + "Page 2: 4 compound pairs\n", |
| 40 | + " CHEMBL178234 C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4\n", |
| 41 | + " CHEMBL2139698 CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]\n", |
| 42 | + " CHEMBL1356585 C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl\n", |
| 43 | + " CHEMBL93928 CN1CC[C@@]23CCCCC3C1CC4=CC=C(C=C42)O\n", |
| 44 | + "Page 3: 6 compound pairs\n", |
| 45 | + " CHEMBL2139698 CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]\n", |
| 46 | + " CHEMBL358173 CCCCCN(CCCCC)C(=O)[C@H]1CCN([C@@H](C1)C(=O)NCCN(C)CC2=C(C=CC=C2)OC)C(=O)N(C3=CC=CC=C3)C4=CC=CC=C4.CCC\n", |
| 47 | + " CHEMBL174026 C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@@H](CC3=CC=CC=C3)C(=O)N(C)C)NC(=O)C4=CC=CC=C4\n", |
| 48 | + " CHEMBL178234 C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4\n", |
| 49 | + " CHEMBL2332709 CN1CCN(CC1)CC(COC2=CC3=C(C=CC(=O)O3)C=C2OC)O\n", |
| 50 | + " CHEMBL1356585 C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl\n" |
| 51 | + ] |
| 52 | + } |
| 53 | + ], |
| 54 | + "source": [ |
| 55 | + "all_pages = pipeline.process_pdf(\"notebook-data/example.pdf\")\n", |
| 56 | + "# Print a per-page summary; start=1 gives 1-based page numbers without a manual offset.\n", |
| 57 | + "for page_number, pairs in enumerate(all_pages, start=1):\n", |
| 58 | + "    print(f\"Page {page_number}: {len(pairs)} compound pairs\")\n", |
| 59 | + "    for pair in pairs:\n", |
| 60 | + "        print(f\" {pair.label_text:20s} {pair.smiles}\")" |
| 61 | + ] |
| 62 | + }, |
| 63 | + { |
| 64 | + "cell_type": "code", |
| 65 | + "execution_count": 10, |
| 66 | + "id": "960c71fa-f45e-4b6e-93e4-8a9dfb8aa1f4", |
| 67 | + "metadata": {}, |
| 68 | + "outputs": [ |
| 69 | + { |
| 70 | + "data": { |
| 71 | + "text/plain": [ |
| 72 | + "[[],\n", |
| 73 | + " [CompoundPair(structure=Detection(bbox=BBox(x1=387.1636047363281, y1=348.36968994140625, x2=613.1963500976562, y2=613.225830078125), conf=0.9752809405326843, class_id=0), label=Detection(bbox=BBox(x1=370.8852233886719, y1=675.03125, x2=553.0623779296875, y2=702.549560546875), conf=0.9521934390068054, class_id=1), match_distance=211.4725807905811, smiles='C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4', label_text='CHEMBL178234', match_confidence=None),\n", |
| 74 | + " CompoundPair(structure=Detection(bbox=BBox(x1=969.6254272460938, y1=348.4045715332031, x2=1215.8751220703125, y2=613.7841796875), conf=0.9751940965652466, class_id=0), label=Detection(bbox=BBox(x1=962.3702392578125, y1=674.9633178710938, x2=1158.2470703125, y2=702.4953002929688), conf=0.9447051882743835, class_id=1), match_distance=210.15404896831305, smiles='CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]', label_text='CHEMBL2139698', match_confidence=None),\n", |
| 75 | + " CompoundPair(structure=Detection(bbox=BBox(x1=665.7091064453125, y1=406.2452087402344, x2=927.4957885742188, y2=558.6116943359375), conf=0.9658717513084412, class_id=0), label=Detection(bbox=BBox(x1=666.3357543945312, y1=675.1514282226562, x2=862.4793701171875, y2=702.4734497070312), conf=0.9454785585403442, class_id=1), match_distance=208.88001556558808, smiles='C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl', label_text='CHEMBL1356585', match_confidence=None),\n", |
| 76 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1260.3450927734375, y1=387.3260498046875, x2=1519.4493408203125, y2=573.7887573242188), conf=0.9371417760848999, class_id=0), label=Detection(bbox=BBox(x1=1258.1990966796875, y1=675.0844116210938, x2=1426.6123046875, y2=702.7105712890625), conf=0.9486707448959351, class_id=1), match_distance=213.6844316393017, smiles='CN1CC[C@@]23CCCCC3C1CC4=CC=C(C=C42)O', label_text='CHEMBL93928', match_confidence=None)],\n", |
| 77 | + " [CompoundPair(structure=Detection(bbox=BBox(x1=1688.369873046875, y1=34.59661865234375, x2=1934.6395263671875, y2=296.98468017578125), conf=0.9759939908981323, class_id=0), label=Detection(bbox=BBox(x1=1683.040771484375, y1=356.70135498046875, x2=1875.1815185546875, y2=382.9691162109375), conf=0.9374803304672241, class_id=1), match_distance=206.5999407921721, smiles='CCN(C1=CC=CC=C1)C(=O)NC2=CC=CC3=CN=CC=C32.[I-]', label_text='CHEMBL2139698', match_confidence=None),\n", |
| 78 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1094.8671875, y1=629.5819091796875, x2=1360.641357421875, y2=890.28076171875), conf=0.9756079912185669, class_id=0), label=Detection(bbox=BBox(x1=1098.669189453125, y1=952.0912475585938, x2=1277.9361572265625, y2=978.4091796875), conf=0.9439839124679565, class_id=1), match_distance=209.0747962040637, smiles='CCCCCN(CCCCC)C(=O)[C@H]1CCN([C@@H](C1)C(=O)NCCN(C)CC2=C(C=CC=C2)OC)C(=O)N(C3=CC=CC=C3)C4=CC=CC=C4.CCC', label_text='CHEMBL358173', match_confidence=None),\n", |
| 79 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1387.283447265625, y1=688.3386840820312, x2=1650.4847412109375, y2=834.5916748046875), conf=0.9742912650108337, class_id=0), label=Detection(bbox=BBox(x1=1390.84765625, y1=952.2308959960938, x2=1569.8570556640625, y2=978.0966796875), conf=0.9447979927062988, class_id=1), match_distance=207.31092088559816, smiles='C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@@H](CC3=CC=CC=C3)C(=O)N(C)C)NC(=O)C4=CC=CC=C4', label_text='CHEMBL174026', match_confidence=None),\n", |
| 80 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1113.9666748046875, y1=35.128562927246094, x2=1339.7755126953125, y2=296.4732971191406), conf=0.9735834002494812, class_id=0), label=Detection(bbox=BBox(x1=1098.7281494140625, y1=356.7406005859375, x2=1278.6048583984375, y2=382.8806457519531), conf=0.9379453659057617, class_id=1), match_distance=207.55612634310035, smiles='C1=CC2=C(C=C1)N(C=C2C3=CCNCC3)S(=O)(=O)C4=CC=C(Cl)S4', label_text='CHEMBL178234', match_confidence=None),\n", |
| 81 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1680.935546875, y1=716.2552490234375, x2=1941.0592041015625, y2=805.8529663085938), conf=0.9634426832199097, class_id=0), label=Detection(bbox=BBox(x1=1683.184814453125, y1=952.0816040039062, x2=1875.502197265625, y2=978.3987426757812), conf=0.9430668354034424, class_id=1), match_distance=206.6250635277353, smiles='CN1CCN(CC1)CC(COC2=CC3=C(C=CC(=O)O3)C=C2OC)O', label_text='CHEMBL2332709', match_confidence=None),\n", |
| 82 | + " CompoundPair(structure=Detection(bbox=BBox(x1=1389.677490234375, y1=90.54808044433594, x2=1650.2188720703125, y2=241.5001983642578), conf=0.9584228992462158, class_id=0), label=Detection(bbox=BBox(x1=1390.898193359375, y1=356.7479553222656, x2=1583.382080078125, y2=382.7828063964844), conf=0.9401251673698425, class_id=1), match_distance=206.3658432231776, smiles='C1=C(C=C(C(=C1)NC2=NC3=CC(=CC=C3O2)Cl)C(=O)O)Cl', label_text='CHEMBL1356585', match_confidence=None)]]" |
| 83 | + ] |
| 84 | + }, |
| 85 | + "execution_count": 10, |
| 86 | + "metadata": {}, |
| 87 | + "output_type": "execute_result" |
| 88 | + } |
| 89 | + ], |
| 90 | + "source": [ |
| 91 | + "# Same input PDF as above, now also passing output_pdf (presumably the path where an\n", |
| 91 | + "# annotated copy is written - confirm against the ChemPipeline docs).\n", |
| 91 | + "# The call's return value is displayed as this cell's output.\n", |
| 91 | + "pipeline.process_pdf(\n", |
| 91 | + "    \"notebook-data/example.pdf\",\n", |
| 91 | + "    output_pdf=\"notebook-data/example-annotated.pdf\",\n", |
| 91 | + ")\n" |
| 92 | + ] |
| 93 | + }, |
| 94 | + { |
| 95 | + "cell_type": "code", |
| 96 | + "execution_count": null, |
| 97 | + "id": "a5a02db6-fc1d-46b8-92e7-47fd3972031f", |
| 98 | + "metadata": {}, |
| 99 | + "outputs": [], |
| 100 | + "source": [] |
| 101 | + } |
| 102 | + ], |
| 103 | + "metadata": { |
| 104 | + "kernelspec": { |
| 105 | + "display_name": "Python 3 (ipykernel)", |
| 106 | + "language": "python", |
| 107 | + "name": "python3" |
| 108 | + }, |
| 109 | + "language_info": { |
| 110 | + "codemirror_mode": { |
| 111 | + "name": "ipython", |
| 112 | + "version": 3 |
| 113 | + }, |
| 114 | + "file_extension": ".py", |
| 115 | + "mimetype": "text/x-python", |
| 116 | + "name": "python", |
| 117 | + "nbconvert_exporter": "python", |
| 118 | + "pygments_lexer": "ipython3", |
| 119 | + "version": "3.12.8" |
| 120 | + } |
| 121 | + }, |
| 122 | + "nbformat": 4, |
| 123 | + "nbformat_minor": 5 |
| 124 | +} |
0 commit comments