Merge pull request #49 from JigsawStack/feat/obj_detection

Khurdhula-Harshavardhan · web-flow · commit 38d1c24a1f6d · 2025-06-24T13:44:33.000-04:00
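Added typed object detection params/response and tests; added `return_type` options and made several request params optional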
diff --git a/jigsawstack/__init__.py b/jigsawstack/__init__.py
@@ -119,6 +119,7 @@ def __init__(
         ).image_generation
 
 
+
 class AsyncJigsawStack:
     validate: AsyncValidate
     web: AsyncWeb
@@ -229,5 +230,6 @@ def __init__(
         ).image_generation
 
 
+
 # Create a global instance of the Web class
 __all__ = ["JigsawStack", "Search", "JigsawStackError", "AsyncJigsawStack"]
diff --git a/jigsawstack/_client.py b/jigsawstack/_client.py
diff --git a/jigsawstack/audio.py b/jigsawstack/audio.py
@@ -4,7 +4,7 @@
 from .async_request import AsyncRequest, AsyncRequestConfig
 from ._config import ClientConfig
 from typing import Any, Dict, List, cast
-from typing_extensions import NotRequired, TypedDict
+from typing_extensions import NotRequired, TypedDict, Literal
 from .custom_typing import SupportedAccents
 from .helpers import build_path
 
@@ -14,6 +14,7 @@ class TextToSpeechParams(TypedDict):
     accent: NotRequired[SupportedAccents]
     speaker_clone_url: NotRequired[str]
     speaker_clone_file_store_key: NotRequired[str]
+    return_type: NotRequired[Literal["url", "binary", "base64"]]
 
 
 class TTSCloneParams(TypedDict):
diff --git a/jigsawstack/image_generation.py b/jigsawstack/image_generation.py
@@ -53,6 +53,8 @@ class ImageGenerationParams(TypedDict):
     File store key to use as image input.
     """
 
+    return_type: NotRequired[Literal["url", "binary", "base64"]]
+
 class ImageGenerationResponse(TypedDict):
     success: bool
     """
diff --git a/jigsawstack/search.py b/jigsawstack/search.py
@@ -109,7 +109,7 @@ class SearchParams(TypedDict):
     Two-letter country code to localize search results (e.g. 'US', 'GB')
     """
 
-    auto_scrape: bool
+    auto_scrape: NotRequired[bool]
     """
     Whether to automatically scrape content from search result URLs
     """
diff --git a/jigsawstack/translate.py b/jigsawstack/translate.py
@@ -1,5 +1,5 @@
 from typing import Any, Dict, List, Union, cast, overload
-from typing_extensions import NotRequired, TypedDict
+from typing_extensions import NotRequired, TypedDict, Literal
 from .request import Request, RequestConfig
 from .async_request import AsyncRequest
 from typing import List, Union
@@ -20,6 +20,8 @@ class TranslateImageParams(TypedDict):
     The file store key of the image to translate.
     """
 
+    return_type: NotRequired[Literal["url", "binary", "base64"]]
+
 class TranslateParams(TypedDict):
     target_language: str
     """
diff --git a/jigsawstack/vision.py b/jigsawstack/vision.py
@@ -1,15 +1,128 @@
 from typing import Any, Dict, List, Union, cast, Optional
-from typing_extensions import NotRequired, TypedDict
+from typing_extensions import NotRequired, TypedDict, Literal
 from typing import Any, Dict, List, cast
-from typing_extensions import NotRequired, TypedDict
+from typing_extensions import NotRequired, TypedDict, Literal
 from .request import Request, RequestConfig
 from .async_request import AsyncRequest, AsyncRequestConfig
 from ._config import ClientConfig
 
 
-class OCRParams(TypedDict):
+class Point(TypedDict):
+    x: int
+    """
+    X coordinate of the point
+    """
+    
+    y: int
+    """
+    Y coordinate of the point
+    """
+
+
+class BoundingBox(TypedDict):
+    top_left: Point
+    """
+    Top-left corner of the bounding box
+    """
+    
+    top_right: Point
+    """
+    Top-right corner of the bounding box
+    """
+    
+    bottom_left: Point
+    """
+    Bottom-left corner of the bounding box
+    """
+    
+    bottom_right: Point
+    """
+    Bottom-right corner of the bounding box
+    """
+    
+    width: int
+    """
+    Width of the bounding box
+    """
+    
+    height: int
+    """
+    Height of the bounding box
+    """
+
+
+class GuiElement(TypedDict):
+    bounds: BoundingBox
+    """
+    Bounding box coordinates of the GUI element
+    """
+    
+    content: Union[str, None]
+    """
+    Content of the GUI element; can be null if no object is detected
+    """
+
+
+class DetectedObject(TypedDict):
+    bounds: BoundingBox
+    """
+    Bounding box coordinates of the detected object
+    """
+    
+    mask: NotRequired[str]
+    """
+    URL or base64 string depending on return_type - only present for some objects
+    """
+
+
+
+class ObjectDetectionParams(TypedDict):
     url: NotRequired[str]
+    """
+    URL of the image to process
+    """
+    
     file_store_key: NotRequired[str]
+    """
+    File store key of the image to process
+    """
+    
+    prompts: NotRequired[List[str]]
+    """
+    List of prompts for object detection
+    """
+    
+    features: NotRequired[List[Literal["object_detection", "gui"]]]
+    """
+    List of features to enable: object_detection, gui
+    """
+    
+    annotated_image: NotRequired[bool]
+    """
+    Whether to return an annotated image
+    """
+    
+    return_type: NotRequired[Literal["url", "base64"]]
+    """
+    Format for returned images: url or base64
+    """
+
+
+class ObjectDetectionResponse(TypedDict):  # NOTE(review): the tests in this change index result["success"], which is not declared here - confirm whether a `success: bool` key belongs in this type
+    annotated_image: NotRequired[str]
+    """
+    URL or base64 string of annotated image (included only if annotated_image=true and objects/gui_elements exist)
+    """
+    
+    gui_elements: NotRequired[List[GuiElement]]
+    """
+    List of detected GUI elements (included only if features includes "gui")
+    """
+    
+    objects: NotRequired[List[DetectedObject]]
+    """
+    List of detected objects (included only if features includes "object_detection")
+    """
 
 
 class VOCRParams(TypedDict):
@@ -60,7 +173,7 @@ def vocr(self, params: VOCRParams) -> OCRResponse:
         ).perform_with_content()
         return resp
 
-    def object_detection(self, params: OCRParams) -> OCRResponse:
+    def object_detection(self, params: ObjectDetectionParams) -> ObjectDetectionResponse:
         path = "/ai/object_detection"
         resp = Request(
             config=self.config,
@@ -97,9 +210,9 @@ async def vocr(self, params: VOCRParams) -> OCRResponse:
         ).perform_with_content()
         return resp
 
-    async def object_detection(self, params: OCRParams) -> OCRResponse:
+    async def object_detection(self, params: ObjectDetectionParams) -> ObjectDetectionResponse:
         path = "/ai/object_detection"
-        resp = AsyncRequest(
+        resp = await AsyncRequest(
             config=self.config,
             path=path,
             params=cast(Dict[Any, Any], params),
diff --git a/jigsawstack/web.py b/jigsawstack/web.py
@@ -42,9 +42,9 @@ class DNSResponse(TypedDict):
 # HTML to Any
 #
 class HTMLToAnyParams(TypedDict):
-    html: str
-    url: str
-    goto_options: NotRequired[object]
+    html: NotRequired[str]
+    url: NotRequired[str]
+    goto_options: NotRequired[Dict[str, Union[int, str]]]
     scale: NotRequired[int]
     full_page: NotRequired[bool]
     omit_background: NotRequired[bool]
@@ -59,6 +59,7 @@ class HTMLToAnyParams(TypedDict):
     is_mobile: NotRequired[bool]
     dark_mode: NotRequired[bool]
     use_graphic_renderer: NotRequired[bool]
+    return_type: NotRequired[Literal["url", "binary", "base64"]]
 
 
 class HTMLToAnyResponse(TypedDict):
diff --git a/tests/test_object_detection.py b/tests/test_object_detection.py
@@ -0,0 +1,36 @@
+from unittest.mock import MagicMock
+import unittest
+from jigsawstack.exceptions import JigsawStackError
+import jigsawstack
+import pytest
+import asyncio
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+jigsaw = jigsawstack.JigsawStack()
+async_jigsaw = jigsawstack.AsyncJigsawStack()
+
+
+def test_object_detection_response():  # sync client: object detection on a demo image URL should report success
+    try:
+        result = jigsaw.vision.object_detection({"url": "https://rogilvkqloanxtvjfrkm.supabase.co/storage/v1/object/public/demo/Collabo%201080x842.jpg"})
+        print(result)
+        assert result["success"]  # plain truthiness check; PEP 8 discourages comparing to True with ==
+    except JigsawStackError as e:
+        pytest.fail(f"Unexpected JigsawStackError: {e}")  # turn an API error into an explicit test failure
+
+
+def test_object_detection_response_async():  # async client: same check as the sync test, driven through asyncio.run
+    async def _test():
+        client = jigsawstack.AsyncJigsawStack()
+        try:
+            result = await client.vision.object_detection({"url": "https://rogilvkqloanxtvjfrkm.supabase.co/storage/v1/object/public/demo/Collabo%201080x842.jpg"})
+            print(result)
+            assert result["success"]  # plain truthiness check; PEP 8 discourages comparing to True with ==
+        except JigsawStackError as e:
+            pytest.fail(f"Unexpected JigsawStackError: {e}")  # turn an API error into an explicit test failure
+
+    asyncio.run(_test())
+
diff --git a/tests/test_search.py b/tests/test_search.py
@@ -14,6 +14,22 @@
 
 
 def test_search_suggestion_response():
+    try:
+        result = jigsaw.web.search({"query": "Where is San Francisco"})  # NOTE(review): identical call to test_ai_search_response - confirm whether a suggestions-specific call was intended here
+        assert result["success"]  # plain truthiness check; PEP 8 discourages comparing to True with ==
+    except JigsawStackError as e:
+        pytest.fail(f"Unexpected JigsawStackError: {e}")  # turn an API error into an explicit test failure
+
+
+def test_ai_search_response():  # sync client: a web search query should report success
+    try:
+        result = jigsaw.web.search({"query": "Where is San Francisco"})
+        assert result["success"]  # plain truthiness check; PEP 8 discourages comparing to True with ==
+    except JigsawStackError as e:
+        pytest.fail(f"Unexpected JigsawStackError: {e}")  # turn an API error into an explicit test failure
+
+
+def test_search_suggestion_response_async():
     async def _test():
         client = jigsawstack.AsyncJigsawStack()
         try:
@@ -25,7 +41,7 @@ async def _test():
     asyncio.run(_test())
 
 
-def test_ai_search_response():
+def test_ai_search_response_async():
     async def _test():
         client = jigsawstack.AsyncJigsawStack()
         try: