From 3a249bdfe070a188cf17ad9d029b1aa4a519646a Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Fri, 22 Nov 2024 20:31:09 +0100 Subject: [PATCH 1/9] feat: remove yolov5 submodule and related configuration --- .gitmodules | 3 --- yolov5 | 1 - 2 files changed, 4 deletions(-) delete mode 100644 .gitmodules delete mode 160000 yolov5 diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e6a4dde..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "yolov5"] - path = yolov5 - url = https://github.com/ultralytics/yolov5 diff --git a/yolov5 b/yolov5 deleted file mode 160000 index aa18599..0000000 --- a/yolov5 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aa1859909c96d5e1fc839b2746b45038ee8465c9 From c888fb524b2fa2fca69af65d9a2d3cbc13ffcdf9 Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Fri, 22 Nov 2024 20:44:44 +0100 Subject: [PATCH 2/9] refactor: rename infer.py to pose/infer.py and restructure code; add pyproject.toml for packaging --- infer.py => pose/infer.py | 137 +++++++++++++++++++++++++------------- pyproject.toml | 20 ++++++ requirements.txt | 3 - 3 files changed, 110 insertions(+), 50 deletions(-) rename infer.py => pose/infer.py (55%) create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/infer.py b/pose/infer.py similarity index 55% rename from infer.py rename to pose/infer.py index adf15a4..b6c3cc0 100644 --- a/infer.py +++ b/pose/infer.py @@ -1,53 +1,75 @@ -import torch +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from pathlib import Path + import cv2 -import argparse import numpy as np -from tqdm import tqdm -from pathlib import Path +import torch +from ultralytics import YOLO from torchvision import transforms as T +from tqdm import tqdm from pose.models import get_pose_model -from pose.utils.boxes import letterbox, scale_boxes, non_max_suppression, xyxy2xywh +from pose.utils.boxes import letterbox, non_max_suppression, scale_boxes, xyxy2xywh from pose.utils.decode import get_final_preds, get_simdr_final_preds -from pose.utils.utils import setup_cudnn, get_affine_transform, draw_keypoints -from pose.utils.utils import VideoReader, VideoWriter, WebcamStream, FPS - -import sys -sys.path.insert(0, 'yolov5') -from yolov5.models.experimental import attempt_load +from pose.utils.utils import ( + FPS, + VideoReader, + VideoWriter, + WebcamStream, + draw_keypoints, + get_affine_transform, + setup_cudnn, +) class Pose: - def __init__(self, + def __init__( + self, det_model, pose_model, img_size=640, conf_thres=0.25, - iou_thres=0.45, + iou_thres=0.45, ) -> None: self.img_size = img_size self.conf_thres = conf_thres self.iou_thres = iou_thres - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.det_model = attempt_load(det_model, map_location=self.device) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.det_model = YOLO(det_model) self.det_model = self.det_model.to(self.device) self.model_name = pose_model self.pose_model = get_pose_model(pose_model) - self.pose_model.load_state_dict(torch.load(pose_model, map_location='cpu')) + self.pose_model.load_state_dict(torch.load(pose_model, map_location="cpu")) self.pose_model = self.pose_model.to(self.device) self.pose_model.eval() self.patch_size = (192, 256) - self.pose_transform = T.Compose([ - T.ToTensor(), - T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) - ]) + self.pose_transform = T.Compose( + [T.ToTensor(), T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))] + ) 
self.coco_skeletons = [ - [16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13], [6,7],[6,8], - [7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7] + [16, 14], + [14, 12], + [17, 15], + [15, 13], + [12, 13], + [6, 12], + [7, 13], + [6, 7], + [6, 8], + [7, 9], + [8, 10], + [9, 11], + [2, 3], + [1, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + [5, 7], ] def preprocess(self, image): @@ -64,15 +86,19 @@ def box_to_center_scale(self, boxes, pixel_std=200): mask = boxes[:, 2] > boxes[:, 3] * r boxes[mask, 3] = boxes[mask, 2] / r boxes[~mask, 2] = boxes[~mask, 3] * r - boxes[:, 2:] /= pixel_std + boxes[:, 2:] /= pixel_std boxes[:, 2:] *= 1.25 return boxes def predict_poses(self, boxes, img): image_patches = [] for cx, cy, w, h in boxes: - trans = get_affine_transform(np.array([cx, cy]), np.array([w, h]), self.patch_size) - img_patch = cv2.warpAffine(img, trans, self.patch_size, flags=cv2.INTER_LINEAR) + trans = get_affine_transform( + np.array([cx, cy]), np.array([w, h]), self.patch_size + ) + img_patch = cv2.warpAffine( + img, trans, self.patch_size, flags=cv2.INTER_LINEAR + ) img_patch = self.pose_transform(img_patch) image_patches.append(img_patch) @@ -88,7 +114,7 @@ def postprocess(self, pred, img1, img0): boxes = self.box_to_center_scale(boxes) outputs = self.predict_poses(boxes, img0) - if 'simdr' in self.model_name: + if "simdr" in self.model_name: coords = get_simdr_final_preds(*outputs, boxes, self.patch_size) else: coords = get_final_preds(outputs, boxes) @@ -98,40 +124,48 @@ def postprocess(self, pred, img1, img0): @torch.no_grad() def predict(self, image): img = self.preprocess(image) - pred = self.det_model(img)[0] + pred = self.det_model(img)[0] self.postprocess(pred, img, image) return image def argument_parser(): - parser = argparse.ArgumentParser() - parser.add_argument('--source', type=str, default='assests/test.jpg') - parser.add_argument('--det-model', type=str, default='checkpoints/crowdhuman_yolov5m.pt') - parser.add_argument('--pose-model', type=str, default='checkpoints/pretrained/simdr_hrnet_w32_256x192.pth') - parser.add_argument('--img-size', type=int, default=640) - parser.add_argument('--conf-thres', type=float, default=0.4) - parser.add_argument('--iou-thres', type=float, default=0.5) + parser = ArgumentParser( + description="Pose Estimation", + formatter_class=ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--source", type=str, default="assests/test.jpg") + parser.add_argument( + "--det-model", type=str, default="checkpoints/crowdhuman_yolov5m.pt" + ) + parser.add_argument( + "--pose-model", + type=str, + default="checkpoints/pretrained/simdr_hrnet_w32_256x192.pth", + ) + parser.add_argument("--img-size", type=int, default=640) + parser.add_argument("--conf-thres", type=float, default=0.4) + parser.add_argument("--iou-thres", type=float, default=0.5) return parser.parse_args() -if __name__ == '__main__': +def main(): setup_cudnn() args = argument_parser() pose = Pose( - args.det_model, - args.pose_model, - args.img_size, - args.conf_thres, - args.iou_thres + args.det_model, args.pose_model, args.img_size, args.conf_thres, args.iou_thres ) source = Path(args.source) - if source.is_file() and source.suffix in ['.jpg', '.png']: + if source.is_file() and source.suffix in [".jpg", ".png"]: image = cv2.imread(str(source)) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) output = pose.predict(image) - cv2.imwrite(f"{str(source).rsplit('.', maxsplit=1)[0]}_out.jpg", cv2.cvtColor(output, cv2.COLOR_RGB2BGR)) + cv2.imwrite( + f"{str(source).rsplit('.', 
maxsplit=1)[0]}_out.jpg", + cv2.cvtColor(output, cv2.COLOR_RGB2BGR), + ) elif source.is_dir(): files = source.glob("*.jpg") @@ -139,11 +173,16 @@ def argument_parser(): image = cv2.imread(str(file)) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) output = pose.predict(image) - cv2.imwrite(f"{str(file).rsplit('.', maxsplit=1)[0]}_out.jpg", cv2.cvtColor(output, cv2.COLOR_RGB2BGR)) + cv2.imwrite( + f"{str(file).rsplit('.', maxsplit=1)[0]}_out.jpg", + cv2.cvtColor(output, cv2.COLOR_RGB2BGR), + ) - elif source.is_file() and source.suffix in ['.mp4', '.avi']: + elif source.is_file() and source.suffix in [".mp4", ".avi"]: reader = VideoReader(args.source) - writer = VideoWriter(f"{args.source.rsplit('.', maxsplit=1)[0]}_out.mp4", reader.fps) + writer = VideoWriter( + f"{args.source.rsplit('.', maxsplit=1)[0]}_out.mp4", reader.fps + ) fps = FPS(len(reader.frames)) for frame in tqdm(reader): @@ -151,7 +190,7 @@ def argument_parser(): output = pose.predict(frame.numpy()) fps.stop(False) writer.update(output) - + print(f"FPS: {fps.fps}") writer.write() @@ -163,4 +202,8 @@ def argument_parser(): fps.start() output = pose.predict(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) fps.stop() - cv2.imshow('frame', cv2.cvtColor(output, cv2.COLOR_RGB2BGR)) \ No newline at end of file + cv2.imwrite("frame.jpg", cv2.cvtColor(output, cv2.COLOR_RGB2BGR)) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b0ed1b8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "pose-estimation" +version = "0.1.0" +description = "Top-Down Multi-person Pose Estimation" +readme = "README.md" +requires-python = ">=3.10.12" +dependencies = [ + "numpy>=2.1.3", + "opencv-python-headless>=4.10.0.84", + "tqdm>=4.67.0", + "ultralytics>=8.3.23", +] +scripts = { pose = "pose.infer:main" } + +[tool.hatch.build.targets.wheel] +packages = ["pose"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7b32f69..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -opencv-python -numpy -tqdm From eff98eedc75679b18168e3132bfdaebbcbba7690 Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Fri, 22 Nov 2024 20:53:00 +0100 Subject: [PATCH 3/9] Update pyproject.toml --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b0ed1b8..76f0150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,10 +9,10 @@ description = "Top-Down Multi-person Pose Estimation" readme = "README.md" requires-python = ">=3.10.12" dependencies = [ - "numpy>=2.1.3", - "opencv-python-headless>=4.10.0.84", - "tqdm>=4.67.0", - "ultralytics>=8.3.23", + "numpy~=1.26.4", + "opencv-python-headless~=4.10.0.84", + "tqdm~=4.67.0", + "ultralytics~=8.3.23", ] scripts = { pose = "pose.infer:main" } From b0f71dec7b15be5bab3ad681ebf47694d0e41567 Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Fri, 22 Nov 2024 20:53:51 +0100 Subject: [PATCH 4/9] fix: update dependency versions in pyproject.toml for compatibility --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b0ed1b8..76f0150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,10 +9,10 @@ description = "Top-Down Multi-person Pose Estimation" readme = "README.md" requires-python = ">=3.10.12" dependencies = [ - "numpy>=2.1.3", - "opencv-python-headless>=4.10.0.84", - "tqdm>=4.67.0", - 
"ultralytics>=8.3.23", + "numpy~=1.26.4", + "opencv-python-headless~=4.10.0.84", + "tqdm~=4.67.0", + "ultralytics~=8.3.23", ] scripts = { pose = "pose.infer:main" } From 98aed058370b38943d75929c519b77031ff5af3e Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Wed, 27 Nov 2024 20:44:32 +0100 Subject: [PATCH 5/9] fix: update detection model loading logic and change to inference mode --- pose/infer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pose/infer.py b/pose/infer.py index b6c3cc0..76b10aa 100644 --- a/pose/infer.py +++ b/pose/infer.py @@ -35,8 +35,15 @@ def __init__( self.conf_thres = conf_thres self.iou_thres = iou_thres self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.det_model = YOLO(det_model) - self.det_model = self.det_model.to(self.device) + + if "yolov5" in det_model: + self.det_model = torch.hub.load( + "ultralytics/yolov5", "custom", path=det_model, force_reload=True + ) + self.det_model = self.det_model.to(self.device) + else: + self.det_model = YOLO(det_model) + self.det_model = self.det_model.to(self.device) self.model_name = pose_model self.pose_model = get_pose_model(pose_model) @@ -121,7 +128,7 @@ def postprocess(self, pred, img1, img0): draw_keypoints(img0, coords, self.coco_skeletons) - @torch.no_grad() + @torch.inference_mode() def predict(self, image): img = self.preprocess(image) pred = self.det_model(img)[0] From 794db12ae88359d0a250054b51dc14e34fa2da2b Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Wed, 27 Nov 2024 22:45:41 +0100 Subject: [PATCH 6/9] refactor: enhance type hints, improve argument parsing, and clean up code formatting --- pose/infer.py | 42 +++++++++++++++++++++++++++++------------ pose/models/__init__.py | 16 ++++++++-------- pose/utils/boxes.py | 19 +++++++++++-------- pose/utils/decode.py | 18 ++++++++++-------- pose/utils/utils.py | 30 +++++++++++++++++------------ pyproject.toml | 1 + 6 files changed, 78 insertions(+), 48 deletions(-) diff --git a/pose/infer.py b/pose/infer.py index 76b10aa..26e5276 100644 --- a/pose/infer.py +++ b/pose/infer.py @@ -25,23 +25,26 @@ class Pose: def __init__( self, - det_model, - pose_model, - img_size=640, - conf_thres=0.25, - iou_thres=0.45, + det_model: str, + pose_model: str, + img_size: int = 640, + conf_thres: float = 0.25, + iou_thres: float = 0.45, ) -> None: self.img_size = img_size self.conf_thres = conf_thres self.iou_thres = iou_thres + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if "yolov5" in det_model: + self.det_model_type = "yolov5" self.det_model = torch.hub.load( "ultralytics/yolov5", "custom", path=det_model, force_reload=True ) self.det_model = self.det_model.to(self.device) else: + self.det_model_type = "yolo" self.det_model = YOLO(det_model) self.det_model = self.det_model.to(self.device) @@ -121,7 +124,7 @@ def postprocess(self, pred, img1, img0): boxes = self.box_to_center_scale(boxes) outputs = self.predict_poses(boxes, img0) - if "simdr" in self.model_name: + if "simdr" in self.model_name.lower(): coords = get_simdr_final_preds(*outputs, boxes, self.patch_size) else: coords = get_final_preds(outputs, boxes) @@ -141,18 +144,29 @@ def argument_parser(): description="Pose Estimation", formatter_class=ArgumentDefaultsHelpFormatter, ) - parser.add_argument("--source", type=str, default="assests/test.jpg") parser.add_argument( - "--det-model", type=str, default="checkpoints/crowdhuman_yolov5m.pt" + "--source", + type=str, + default="assests/test.jpg", + help="Path to image, video 
or webcam", + ) + parser.add_argument( + "--det-model", + type=str, + default="checkpoints/crowdhuman_yolov5m.pt", + help="Human detection model", ) parser.add_argument( "--pose-model", type=str, default="checkpoints/pretrained/simdr_hrnet_w32_256x192.pth", + help="Pose estimation model", + ) + parser.add_argument("--img-size", type=int, default=640, help="Image size") + parser.add_argument( + "--conf-thres", type=float, default=0.5, help="Confidence threshold" ) - parser.add_argument("--img-size", type=int, default=640) - parser.add_argument("--conf-thres", type=float, default=0.4) - parser.add_argument("--iou-thres", type=float, default=0.5) + parser.add_argument("--iou-thres", type=float, default=0.5, help="IOU threshold") return parser.parse_args() @@ -160,7 +174,11 @@ def main(): setup_cudnn() args = argument_parser() pose = Pose( - args.det_model, args.pose_model, args.img_size, args.conf_thres, args.iou_thres + det_model=args.det_model, + pose_model=args.pose_model, + img_size=args.img_size, + conf_thres=args.conf_thres, + iou_thres=args.iou_thres, ) source = Path(args.source) diff --git a/pose/models/__init__.py b/pose/models/__init__.py index 27f10a0..1cf1921 100644 --- a/pose/models/__init__.py +++ b/pose/models/__init__.py @@ -1,15 +1,15 @@ from .posehrnet import PoseHRNet from .simdr import SimDR +__all__ = ["PoseHRNet", "SimDR"] -__all__ = ['PoseHRNet', 'SimDR'] - -def get_pose_model(model_path: str): - if 'posehrnet' in model_path: - model = PoseHRNet('w32' if 'w32' in model_path else 'w48') - elif 'simdr' in model_path: - model = SimDR('w32' if 'w32' in model_path else 'w48') +def get_pose_model(model_path: str) -> PoseHRNet | SimDR: + if "posehrnet" in model_path.lower(): + model = PoseHRNet("w32" if "w32" in model_path.lower() else "w48") + elif "simdr" in model_path.lower(): + model = SimDR("w32" if "w32" in model_path.lower() else "w48") else: raise NotImplementedError - return model \ No newline at end of file + + return model diff --git a/pose/utils/boxes.py b/pose/utils/boxes.py index 0b208be..599b5d5 100644 --- a/pose/utils/boxes.py +++ b/pose/utils/boxes.py @@ -1,6 +1,6 @@ import cv2 -import torch import numpy as np +import torch from torchvision import ops @@ -18,7 +18,9 @@ def letterbox(img, new_shape=(640, 640)): top, bottom = round(pH - 0.1), round(pH + 0.1) left, right = round(pW - 0.1), round(pW + 0.1) - img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)) + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114) + ) return img @@ -31,7 +33,7 @@ def scale_boxes(boxes, orig_shape, new_shape): boxes[:, ::2] -= pad[1] boxes[:, 1::2] -= pad[0] boxes[:, :4] /= gain - + boxes[:, ::2].clamp_(0, orig_shape[1]) boxes[:, 1::2].clamp_(0, orig_shape[0]) return boxes.round() @@ -56,7 +58,7 @@ def xyxy2xywh(x): def non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=None): - candidates = pred[..., 4] > conf_thres + candidates = pred[..., 4] > conf_thres max_wh = 4096 max_nms = 30000 @@ -67,10 +69,11 @@ def non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=None): for xi, x in enumerate(pred): x = x[candidates[xi]] - if not x.shape[0]: continue + if not x.shape[0]: + continue # compute conf - x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf + x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf # box box = xywh2xyxy(x[:, :4]) @@ -85,7 +88,7 @@ def non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=None): # check shape n = x.shape[0] - if 
not n: + if not n: continue elif n > max_nms: x = x[x[:, 4].argsort(descending=True)[:max_nms]] @@ -100,4 +103,4 @@ def non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=None): output[xi] = x[keep] - return output \ No newline at end of file + return output diff --git a/pose/utils/decode.py b/pose/utils/decode.py index d6fa353..ea26258 100644 --- a/pose/utils/decode.py +++ b/pose/utils/decode.py @@ -1,10 +1,13 @@ import math -import torch + import numpy as np +import torch from torch import Tensor -def get_simdr_final_preds(pred_x: Tensor, pred_y: Tensor, boxes: Tensor, image_size: tuple): +def get_simdr_final_preds( + pred_x: Tensor, pred_y: Tensor, boxes: Tensor, image_size: tuple +): center, scale = boxes[:, :2].numpy(), boxes[:, 2:].numpy() pred_x, pred_y = pred_x.softmax(dim=2), pred_y.softmax(dim=2) @@ -29,11 +32,10 @@ def get_final_preds(heatmaps: Tensor, boxes: Tensor): py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < W - 1 and 1 < py < H - 1: - diff = np.array([ - hm[py][px+1] - hm[py][px-1], - hm[py+1][px] - hm[py-1][px] - ]) - coords[n][p] += np.sign(diff) * .25 + diff = np.array( + [hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px]] + ) + coords[n][p] += np.sign(diff) * 0.25 for i in range(B): coords[i] = transform_preds(coords[i], center[i], scale[i], [W, H]) @@ -59,4 +61,4 @@ def transform_preds(coords, center, scale, output_size): target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 - return target_coords \ No newline at end of file + return target_coords diff --git a/pose/utils/utils.py b/pose/utils/utils.py index f14277c..9fef30b 100644 --- a/pose/utils/utils.py +++ b/pose/utils/utils.py @@ -14,27 +14,32 @@ def setup_cudnn() -> None: def draw_coco_keypoints(img, keypoints, skeletons): - if keypoints == []: return img + if keypoints == []: + return img image = img.copy() for kpts in keypoints: for x, y, v in kpts: if v == 2: cv2.circle(image, (x, y), 4, (255, 0, 0), 2) for kid1, kid2 in skeletons: - x1, y1, v1 = kpts[kid1-1] - x2, y2, v2 = kpts[kid2-1] + x1, y1, v1 = kpts[kid1 - 1] + x2, y2, v2 = kpts[kid2 - 1] if v1 == 2 and v2 == 2: - cv2.line(image, (x1, y1), (x2, y2), (0, 255, 0), 2) - return image + cv2.line(image, (x1, y1), (x2, y2), (0, 255, 0), 2) + return image def draw_keypoints(img, keypoints, skeletons): - if keypoints == []: return img + if len(keypoints) == 0 or ( + isinstance(keypoints, np.ndarray) and keypoints.size == 0 + ): + return img + for kpts in keypoints: for x, y in kpts: cv2.circle(img, (x, y), 4, (255, 0, 0), 2, cv2.LINE_AA) for kid1, kid2 in skeletons: - cv2.line(img, kpts[kid1-1], kpts[kid2-1], (0, 255, 0), 2, cv2.LINE_AA) + cv2.line(img, kpts[kid1 - 1], kpts[kid2 - 1], (0, 255, 0), 2, cv2.LINE_AA) class WebcamStream: @@ -56,7 +61,7 @@ def __iter__(self): def __next__(self): self.count += 1 - if cv2.waitKey(1) == ord('q'): + if cv2.waitKey(1) == ord("q"): self.stop() return self.frame.copy() @@ -71,8 +76,8 @@ def __len__(self): class VideoReader: def __init__(self, video: str): - self.frames, _, info = io.read_video(video, pts_unit='sec') - self.fps = info['video_fps'] + self.frames, _, info = io.read_video(video, pts_unit="sec") + self.fps = info["video_fps"] print(f"Processing '{video}'...") print(f"Total Frames: {len(self.frames)}") @@ -130,7 +135,8 @@ def stop(self, debug=True): self.counts += 1 if self.counts == self.avg: self.fps = round(self.counts / self.accum_time) - 
if debug: print(f"FPS: {self.fps}") + if debug: + print(f"FPS: {self.fps}") self.counts = 0 self.accum_time = 0 @@ -168,4 +174,4 @@ def get_affine_transform(center, scale, patch_size, rot=0, inv=False): src[2:, :] = get_3rd_point(src[0, :], src[1, :]) dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) - return cv2.getAffineTransform(dst, src) if inv else cv2.getAffineTransform(src, dst) \ No newline at end of file + return cv2.getAffineTransform(dst, src) if inv else cv2.getAffineTransform(src, dst) diff --git a/pyproject.toml b/pyproject.toml index 76f0150..b2569dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ description = "Top-Down Multi-person Pose Estimation" readme = "README.md" requires-python = ">=3.10.12" dependencies = [ + "gitpython~=3.1.43", "numpy~=1.26.4", "opencv-python-headless~=4.10.0.84", "tqdm~=4.67.0", From c90f4f61669bd7f24482d1c76b5c3947bdb3e3d1 Mon Sep 17 00:00:00 2001 From: Mathias Nielsen Date: Thu, 28 Nov 2024 14:15:59 +0100 Subject: [PATCH 7/9] feat: add bounding box drawing function and integrate it into pose inference --- pose/infer.py | 4 +- pose/models/simdr.py | 27 +++++++--- pose/utils/boxes.py | 121 ++++++++++++++++++++++++++----------------- pose/utils/utils.py | 38 ++++++++++++++ pyproject.toml | 1 + 5 files changed, 135 insertions(+), 56 deletions(-) diff --git a/pose/infer.py b/pose/infer.py index 26e5276..91a843a 100644 --- a/pose/infer.py +++ b/pose/infer.py @@ -16,6 +16,7 @@ VideoReader, VideoWriter, WebcamStream, + draw_bbox, draw_keypoints, get_affine_transform, setup_cudnn, @@ -129,7 +130,8 @@ def postprocess(self, pred, img1, img0): else: coords = get_final_preds(outputs, boxes) - draw_keypoints(img0, coords, self.coco_skeletons) + img0 = draw_keypoints(img0, coords, self.coco_skeletons) + img0 = draw_bbox(img0, det.cpu().numpy()) @torch.inference_mode() def predict(self, image): diff --git a/pose/models/simdr.py b/pose/models/simdr.py index 7bb80e8..f43494a 100644 --- a/pose/models/simdr.py +++ b/pose/models/simdr.py @@ -1,10 +1,16 @@ import torch -from torch import nn, Tensor +from torch import Tensor, nn + from .backbones import HRNet class SimDR(nn.Module): - def __init__(self, backbone: str = 'w32', num_joints: int = 17, image_size: tuple = (256, 192)): + def __init__( + self, + backbone: str = "w32", + num_joints: int = 17, + image_size: tuple = (256, 192), + ): super().__init__() self.backbone = HRNet(backbone) self.final_layer = nn.Conv2d(self.backbone.all_channels[0], num_joints, 1) @@ -15,14 +21,16 @@ def __init__(self, backbone: str = 'w32', num_joints: int = 17, image_size: tupl def _init_weights(self, m: nn.Module) -> None: if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def init_pretrained(self, pretrained: str = None) -> None: if pretrained: - self.backbone.load_state_dict(torch.load(pretrained, map_location='cpu'), strict=False) + self.backbone.load_state_dict( + torch.load(pretrained, map_location="cpu"), strict=False + ) def forward(self, x: Tensor) -> Tensor: out = self.backbone(x) @@ -32,10 +40,13 @@ def forward(self, x: Tensor) -> Tensor: return pred_x, pred_y -if __name__ == '__main__': - from torch.nn import functional as F - model = SimDR('w32') - model.load_state_dict(torch.load('checkpoints/pretrained/simdr_hrnet_w32_256x192.pth', map_location='cpu')) +if __name__ == "__main__": + 
model = SimDR("w32") + model.load_state_dict( + torch.load( + "checkpoints/pretrained/simdr_hrnet_w32_256x192.pth", map_location="cpu" + ) + ) x = torch.randn(4, 3, 256, 192) px, py = model(x) print(px.shape, py.shape) diff --git a/pose/utils/boxes.py b/pose/utils/boxes.py index 599b5d5..951f2ab 100644 --- a/pose/utils/boxes.py +++ b/pose/utils/boxes.py @@ -57,50 +57,77 @@ def xyxy2xywh(x): return y -def non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=None): - candidates = pred[..., 4] > conf_thres - - max_wh = 4096 - max_nms = 30000 - max_det = 300 - - output = [torch.zeros((0, 6), device=pred.device)] * pred.shape[0] - - for xi, x in enumerate(pred): - x = x[candidates[xi]] - - if not x.shape[0]: - continue - - # compute conf - x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf - - # box - box = xywh2xyxy(x[:, :4]) - - # detection matrix nx6 - conf, j = x[:, 5:].max(1, keepdim=True) - x = torch.cat([box, conf, j.float()], dim=1)[conf.view(-1) > conf_thres] - - # filter by class - if classes is not None: - x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] - - # check shape - n = x.shape[0] - if not n: - continue - elif n > max_nms: - x = x[x[:, 4].argsort(descending=True)[:max_nms]] - - # batched nms - c = x[:, 5:6] * max_wh - boxes, scores = x[:, :4] + c, x[:, 4] - keep = ops.nms(boxes, scores, iou_thres) - - if keep.shape[0] > max_det: - keep = keep[:max_det] - - output[xi] = x[keep] - - return output +def non_max_suppression( + pred: torch.Tensor, + conf_thres: float = 0.25, + iou_thres: float = 0.45, + classes: list = None, + max_det: int = 300, +) -> list: + """ + Non-Maximum Suppression (NMS) on inference results + + Args: + pred: predictions tensor (n,7) [x, y, w, h, obj_conf, cls1_conf, cls2_conf] + conf_thres: confidence threshold + iou_thres: NMS IoU threshold + classes: filter by class (e.g. 
[0] for persons only)
+        max_det: maximum number of detections per image
+
+    Returns:
+        list of detections, one (n,6) tensor per image [xyxy, conf, cls]
+    """
+    # Ensure pred is 2D
+    if pred.dim() == 1:
+        pred = pred.unsqueeze(0)
+
+    # Calculate confidence
+    conf = pred[:, 4]  # objectness score
+    class_scores = pred[:, 5:]  # class probabilities
+    class_conf, class_pred = class_scores.max(1)  # best class confidence and prediction
+    confidence = conf * class_conf  # combine scores
+
+    # Filter by confidence
+    conf_mask = confidence > conf_thres
+    pred = pred[conf_mask]
+    confidence = confidence[conf_mask]
+    class_pred = class_pred[conf_mask]
+
+    if not pred.shape[0]:  # no boxes
+        return [torch.zeros((0, 6), device=pred.device)]
+
+    # Convert boxes from [x, y, w, h] to [x1, y1, x2, y2]
+    boxes = xywh2xyxy(pred[:, :4])
+
+    # Filter by class
+    if classes is not None:
+        if isinstance(classes, int):
+            classes = [classes]
+        class_mask = torch.zeros_like(class_pred, dtype=torch.bool)
+        for c in classes:
+            class_mask |= class_pred == c
+        boxes = boxes[class_mask]
+        confidence = confidence[class_mask]
+        class_pred = class_pred[class_mask]
+
+    if not boxes.shape[0]:  # no boxes after filtering
+        return [torch.zeros((0, 6), device=pred.device)]
+
+    # Sort by confidence
+    sorted_indices = torch.argsort(confidence, descending=True)
+    boxes = boxes[sorted_indices]
+    confidence = confidence[sorted_indices]
+    class_pred = class_pred[sorted_indices]
+
+    # Apply NMS
+    keep = ops.nms(boxes, confidence, iou_thres)
+    if keep.shape[0] > max_det:
+        keep = keep[:max_det]
+
+    # Combine detections into final format [x1, y1, x2, y2, conf, cls]
+    output = torch.zeros((keep.shape[0], 6), device=pred.device)
+    output[:, :4] = boxes[keep]
+    output[:, 4] = confidence[keep]
+    output[:, 5] = class_pred[keep].float()
+
+    return [output]
diff --git a/pose/utils/utils.py b/pose/utils/utils.py
index 9fef30b..6639a25 100644
--- a/pose/utils/utils.py
+++ b/pose/utils/utils.py
@@ -13,6 +13,42 @@ def setup_cudnn() -> None:
     cudnn.deterministic = False
 
 
+def draw_bbox(
+    img: np.ndarray,
+    boxes: np.ndarray,
+    color: tuple[int, int, int] = (0, 255, 0),
+    thickness: int = 2,
+    font_scale: float = 0.5,
+    text_color: tuple[int, int, int] = (255, 255, 255),
+    text_thickness: int = 1,
+) -> np.ndarray:
+    for box in boxes:
+        x1, y1, x2, y2, conf = *map(int, box[:4]), float(box[4])  # keep conf a float so the label stays meaningful
+        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
+        conf_text = f"{conf:.2f}"
+        (text_width, text_height), _ = cv2.getTextSize(
+            conf_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_thickness
+        )
+        cv2.rectangle(
+            img,
+            (x1, y1 - text_height - 5),
+            (x1 + text_width + 5, y1),
+            color,
+            -1,  # Filled rectangle
+        )
+        cv2.putText(
+            img,
+            conf_text,
+            (x1 + 3, y1 - 4),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            font_scale,
+            text_color,
+            text_thickness,
+        )
+
+    return img
+
+
 def draw_coco_keypoints(img, keypoints, skeletons):
     if keypoints == []:
         return img
@@ -41,6 +77,8 @@ def draw_keypoints(img, keypoints, skeletons):
         for kid1, kid2 in skeletons:
             cv2.line(img, kpts[kid1 - 1], kpts[kid2 - 1], (0, 255, 0), 2, cv2.LINE_AA)
 
+    return img
+
 
 class WebcamStream:
     def __init__(self, src=0) -> None:
diff --git a/pyproject.toml b/pyproject.toml
index b2569dd..db7ea20 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
     "gitpython~=3.1.43",
     "numpy~=1.26.4",
     "opencv-python-headless~=4.10.0.84",
+    "torch~=2.5.1",
     "tqdm~=4.67.0",
     "ultralytics~=8.3.23",
 ]

From 68a9625e18d50d59510e569bf8ec497ccdef97ff Mon Sep 17 00:00:00 2001
From: Mathias Nielsen
Date: Sat, 30 Nov 2024 14:09:04 +0100
Subject: [PATCH 8/9] refactor: update box conversion functions with type
 hints and use copy instead of clone

---
 pose/utils/boxes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pose/utils/boxes.py b/pose/utils/boxes.py
index 599b5d5..2f0e332 100644
--- a/pose/utils/boxes.py
+++ b/pose/utils/boxes.py
@@ -39,8 +39,8 @@ def scale_boxes(boxes, orig_shape, new_shape):
     return boxes.round()
 
 
-def xywh2xyxy(x):
-    boxes = x.clone()
+def xywh2xyxy(x: np.ndarray) -> np.ndarray:
+    boxes = x.copy()
     boxes[:, 0] = x[:, 0] - x[:, 2] / 2
     boxes[:, 1] = x[:, 1] - x[:, 3] / 2
     boxes[:, 2] = x[:, 0] + x[:, 2] / 2
@@ -48,8 +48,8 @@ def scale_boxes(boxes, orig_shape, new_shape):
     return boxes
 
 
-def xyxy2xywh(x):
-    y = x.clone()
+def xyxy2xywh(x: np.ndarray) -> np.ndarray:
+    y = x.copy()
     y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
     y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
     y[:, 2] = x[:, 2] - x[:, 0]  # width

From 01201688fa272a586bb65338b90d1d459883ea67 Mon Sep 17 00:00:00 2001
From: Mathias Nielsen
Date: Sat, 30 Nov 2024 14:14:57 +0100
Subject: [PATCH 9/9] feat: enhance box conversion functions to support both
 numpy arrays and torch tensors

---
 pose/utils/boxes.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pose/utils/boxes.py b/pose/utils/boxes.py
index 42ba114..784d6b7 100644
--- a/pose/utils/boxes.py
+++ b/pose/utils/boxes.py
@@ -39,21 +39,35 @@ def scale_boxes(boxes, orig_shape, new_shape):
     return boxes.round()
 
 
-def xywh2xyxy(x: np.ndarray) -> np.ndarray:
-    boxes = x.copy()
+def xywh2xyxy(x: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
+    if isinstance(x, torch.Tensor):
+        boxes = x.clone()
+    elif isinstance(x, np.ndarray):
+        boxes = x.copy()
+    else:
+        raise TypeError("Input must be a tensor or numpy array")
+
     boxes[:, 0] = x[:, 0] - x[:, 2] / 2
     boxes[:, 1] = x[:, 1] - x[:, 3] / 2
     boxes[:, 2] = x[:, 0] + x[:, 2] / 2
     boxes[:, 3] = x[:, 1] + x[:, 3] / 2
+
     return boxes
 
 
-def xyxy2xywh(x: np.ndarray) -> np.ndarray:
-    y = x.copy()
+def xyxy2xywh(x: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
+    if isinstance(x, torch.Tensor):
+        y = x.clone()
+    elif isinstance(x, np.ndarray):
+        y = x.copy()
+    else:
+        raise TypeError("Input must be a tensor or numpy array")
+
     y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
     y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
     y[:, 2] = x[:, 2] - x[:, 0]  # width
     y[:, 3] = x[:, 3] - x[:, 1]  # height
+
     return y
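
A quick round-trip check for the dual numpy/torch support added in PATCH 9/9.
This is a minimal sketch, not part of the series: it assumes only the two
converters in pose/utils/boxes.py, and the box values are illustrative.

    import numpy as np
    import torch

    from pose.utils.boxes import xywh2xyxy, xyxy2xywh

    # One box: center (50, 40), width 20, height 10.
    xywh_np = np.array([[50.0, 40.0, 20.0, 10.0]])
    xywh_pt = torch.tensor([[50.0, 40.0, 20.0, 10.0]])

    # Corners should be (50-10, 40-5, 50+10, 40+5) = (40, 35, 60, 45).
    assert np.allclose(xywh2xyxy(xywh_np), [[40.0, 35.0, 60.0, 45.0]])

    # xywh -> xyxy -> xywh is the identity for both input types.
    assert np.allclose(xyxy2xywh(xywh2xyxy(xywh_np)), xywh_np)
    assert torch.allclose(xyxy2xywh(xywh2xyxy(xywh_pt)), xywh_pt)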
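
In the same spirit, a small sketch exercising the non_max_suppression rewrite
from PATCH 7/9, assuming the (n, 5 + num_classes) prediction layout its
docstring describes; the scores are made up. The two overlapping boxes should
collapse to one detection, and the low-confidence box should be filtered out.

    import torch

    from pose.utils.boxes import non_max_suppression

    # Rows: [x, y, w, h, obj_conf, person_conf] in xywh form.
    pred = torch.tensor(
        [
            [100.0, 100.0, 50.0, 80.0, 0.90, 0.95],  # kept
            [102.0, 101.0, 50.0, 80.0, 0.85, 0.90],  # IoU ~0.9 with row 0, suppressed
            [300.0, 200.0, 40.0, 60.0, 0.10, 0.50],  # conf 0.05, below conf_thres
        ]
    )
    (dets,) = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, classes=[0])
    assert dets.shape == (1, 6)  # one [x1, y1, x2, y2, conf, cls] row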
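
Finally, an end-to-end usage sketch for the Pose class as it stands after this
series, under stated assumptions: the checkpoint paths are the defaults from
pose/infer.py and must already exist locally (the YOLOv5 detector is loaded
through torch.hub), and person.jpg is a placeholder input.

    import cv2

    from pose.infer import Pose
    from pose.utils.utils import setup_cudnn

    setup_cudnn()
    pose = Pose(
        det_model="checkpoints/crowdhuman_yolov5m.pt",
        pose_model="checkpoints/pretrained/simdr_hrnet_w32_256x192.pth",
        img_size=640,
        conf_thres=0.5,
        iou_thres=0.5,
    )

    image = cv2.cvtColor(cv2.imread("person.jpg"), cv2.COLOR_BGR2RGB)
    output = pose.predict(image)  # draws boxes and keypoints on the RGB image
    cv2.imwrite("person_out.jpg", cv2.cvtColor(output, cv2.COLOR_RGB2BGR))

The same flow is reachable from the console script declared in pyproject.toml,
e.g. pose --source path/to/image.jpg, once the package is installed.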