From 079e31e77ae7d7d47d5c2fe650b116cc83865fd3 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Tue, 12 Nov 2024 17:15:11 -0800 Subject: [PATCH 01/19] add mapping for genders in metadata --- utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 3f85d39c..a5ad223e 100644 --- a/utils.py +++ b/utils.py @@ -360,8 +360,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc = importMetadata(defaultMetadataPath) # Get session-specific metadata from api. - - session = getSessionJson(session_id) + session = getSessionJson(session_id) if session['meta'] is not None: if not justCheckerParams: # Backward compatibility @@ -370,7 +369,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc["mass_kg"] = float(session['meta']['subject']['mass']) session_desc["height_m"] = float(session['meta']['subject']['height']) if 'gender' in session['meta']['subject']: - session_desc["gender_mf"] = session['meta']['subject']['gender'] + session_desc["gender_mf"] = getGendersDict().get(session['meta']['subject']['gender']) # Before implementing the subject feature, the posemodel was stored # in session['meta']['subject']. After implementing the subject # feature, the posemodel is stored in session['meta']['settings'] @@ -404,7 +403,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc["subjectID"] = subject_info['name'] session_desc["mass_kg"] = subject_info['weight'] session_desc["height_m"] = subject_info['height'] - session_desc["gender_mf"] = subject_info['gender'] + session_desc["gender_mf"] = getGendersDict().get(subject_info['gender']) try: session_desc["posemodel"] = session['meta']['settings']['posemodel'] except: @@ -1617,6 +1616,16 @@ def get_entry_with_largest_number(trialList): return max_entry +def getGendersDict(): + genders_dict = { + "woman": "Woman", + "man": "Man", + "transgender": "Transgender", + "non-binary": "Non-Binary/Non-Conforming", + "prefer-not-respond": "Prefer not to respond", + } + return genders_dict + # Get local client info and update def getCommitHash(): From f9e32be2bfaf2ae8e7db6b8c704ee7945733f666 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 15:07:01 -0800 Subject: [PATCH 02/19] add makeRequestWithRetry and initial test --- .gitignore | 2 -- utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e94340b0..d1e705b1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,4 @@ Examples/reprocessDataServer.py *.ini *.stats -tests/ - newsletter.py diff --git a/utils.py b/utils.py index 3f85d39c..e3b56828 100644 --- a/utils.py +++ b/utils.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd from scipy import signal +from urllib3.util.retry import Retry from utilsAuth import getToken from utilsAPI import getAPIURL @@ -1654,3 +1655,46 @@ def postProcessedDuration(trial_url, duration): headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r + +# utils for common HTTP requests +def makeRequestWithRetry(method, url, + headers=None, data=None, params=None, + retries=5, backoff_factor=1): + """ + Makes an HTTP request with retry logic and returns the Response object. + + Args: + method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.). + url (str): The endpoint URL. + headers (dict): Headers to include in the request. + data (dict): Data to send in the request body (for POST/PUT requests). 
+ params (dict): URL query parameters. + retries (int): Number of retry attempts. + backoff_factor (float): Backoff factor for exponential delays. + + Returns: + requests.Response: The response object for further processing. + """ + try: + retry_strategy = Retry( + total=retries, + backoff_factor=backoff_factor, + status_forcelist=[429, 500, 502, 503, 504] + ) + + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + with requests.Session() as session: + session.mount("https://", adapter) + response = session.request(method=method, + url=url, + headers=headers, + data=data, + params=params) + response.raise_for_status() + return response + + except requests.exceptions.HTTPError as e: + raise Exception(f"HTTP error occurred: {e}") + + except Exception as e: + raise Exception(f"An error occurred: {e}") From e9fceecc5475c03e4c9bb87fca91c505a7f1b019 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 15:28:27 -0800 Subject: [PATCH 03/19] add tests folder --- tests/test_api.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/test_api.py diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 00000000..41a727b2 --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,9 @@ +from utils import makeRequestWithRetry + +def test_makeRequestWithRetry(self): + test_URL = 'http://httpbin.org/status/404' + + makeRequestWithRetry('GET', test_URL) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 6ff69d4c6897c33702ba29570bcee1c9e7ce3709 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 17:09:45 -0800 Subject: [PATCH 04/19] add pytest and fill out tests for requests retry --- requirements.txt | 3 ++- tests/test_api.py | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3d626337..0937e2f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ pingouin==0.5.2 openpyxl ffmpeg-python psutil -boto3 \ No newline at end of file +boto3 +pytest diff --git a/tests/test_api.py b/tests/test_api.py index 41a727b2..24c0f92c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,9 +1,27 @@ +import logging +import pytest +import requests + from utils import makeRequestWithRetry -def test_makeRequestWithRetry(self): - test_URL = 'http://httpbin.org/status/404' +class TestMakeRequestWithRetry: + logging.getLogger('urllib3').setLevel(logging.DEBUG) - makeRequestWithRetry('GET', test_URL) + def test_get(self): + # should work and return 200 + response = makeRequestWithRetry('GET', 'https://httpbin.org/get') + assert response.status_code == 200 + assert response.json()['url'] == 'https://httpbin.org/get' + + def test_500(self): + # use an error code that triggers retries (should show up in debug log) + with pytest.raises(Exception): + response_500 = makeRequestWithRetry('GET', 'https://httpbin.org/status/500', retries=3, backoff_factor=0.1) + + def test_404(self): + # error code that won't trigger retries + with pytest.raises(Exception): + response_404 = makeRequestWithRetry('GET', 'https://httpbin.org/status/404', retires=2, backoff_factor=0.01) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 484f2bf61d9b6da5ef4a484a916c26f0fca4c910 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 13:23:07 -0800 Subject: [PATCH 05/19] use mock for tests. 
add post to allowed methods for retry --- tests/test_api.py | 81 +++++++++++++++++++++++++++++++++++++---------- utils.py | 7 ++-- 2 files changed, 68 insertions(+), 20 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 24c0f92c..232943d6 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,27 +1,74 @@ import logging import pytest import requests +from unittest.mock import patch, Mock, ANY +from http.client import HTTPMessage from utils import makeRequestWithRetry class TestMakeRequestWithRetry: logging.getLogger('urllib3').setLevel(logging.DEBUG) - def test_get(self): - # should work and return 200 - response = makeRequestWithRetry('GET', 'https://httpbin.org/get') + @patch("requests.Session.request") + def test_get(self, mock_response): + status_code = 200 + mock_response.return_value.status_code = status_code + + response = makeRequestWithRetry('GET', 'https://test.com', retries=2) + assert response.status_code == status_code + mock_response.assert_called_once_with('GET', 'https://test.com', + headers=ANY, data=ANY, params=ANY) + + @patch("requests.Session.request") + def test_put(self, mock_response): + status_code = 201 + mock_response.return_value.status_code = status_code + + data = { + "key1": "value1", + "key2": "value2" + } + + params = { + "param1": "value1" + } + + response = makeRequestWithRetry('POST', + 'https://test.com', + data=data, + headers={"Authorization": "my_token"}, + params=params, + retries=2) + + assert response.status_code == status_code + mock_response.assert_called_once_with('POST', + 'https://test.com', + data=data, + headers={"Authorization": "my_token"}, + params=params) + + @patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") + def test_success_after_retries(self, mock_response): + mock_response.return_value.getresponse.side_effect = [ + Mock(status=500, msg=HTTPMessage()), + Mock(status=502, msg=HTTPMessage()), + Mock(status=200, msg=HTTPMessage()), + Mock(status=429, msg=HTTPMessage()), + ] + + response = makeRequestWithRetry('GET', + 'https://test.com', + retries=5, + backoff_factor=0.1) + assert response.status_code == 200 - assert response.json()['url'] == 'https://httpbin.org/get' - - def test_500(self): - # use an error code that triggers retries (should show up in debug log) - with pytest.raises(Exception): - response_500 = makeRequestWithRetry('GET', 'https://httpbin.org/status/500', retries=3, backoff_factor=0.1) - - def test_404(self): - # error code that won't trigger retries - with pytest.raises(Exception): - response_404 = makeRequestWithRetry('GET', 'https://httpbin.org/status/404', retires=2, backoff_factor=0.01) - -if __name__ == '__main__': - unittest.main() + assert mock_response.call_count == 3 + + # comment out test since httpbin can be unstable and we don't want to rely + # on it for tests. 
uncomment and see debug log to see retry attempts + '''def test_httpbin(self): + response = makeRequestWithRetry('GET', + 'https://httpbin.org/status/500', + retries=4, + backoff_factor=0.1) + ''' \ No newline at end of file diff --git a/utils.py b/utils.py index e3b56828..0c4ab636 100644 --- a/utils.py +++ b/utils.py @@ -1679,14 +1679,15 @@ def makeRequestWithRetry(method, url, retry_strategy = Retry( total=retries, backoff_factor=backoff_factor, - status_forcelist=[429, 500, 502, 503, 504] + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods={'DELETE', 'GET', 'POST', 'PUT'} ) adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) with requests.Session() as session: session.mount("https://", adapter) - response = session.request(method=method, - url=url, + response = session.request(method, + url, headers=headers, data=data, params=params) From bda3d82cb2596c7cee0da7aa7e50eeda31041097 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 14:46:17 -0800 Subject: [PATCH 06/19] replace requests calls with retry wrapper --- app.py | 25 ++++++++---- tests/test_api.py | 8 +++- utils.py | 102 ++++++++++++++++++++++++++++++---------------- utilsChecker.py | 6 ++- utilsServer.py | 68 ++++++++++++++++++++----------- 5 files changed, 139 insertions(+), 70 deletions(-) diff --git a/app.py b/app.py index 7a636cf7..5b7d26c4 100644 --- a/app.py +++ b/app.py @@ -14,7 +14,7 @@ from utils import (getDataDirectory, checkTime, checkResourceUsage, sendStatusEmail, checkForTrialsWithStatus, getCommitHash, getHostname, postLocalClientInfo, - postProcessedDuration) + postProcessedDuration, makeRequestWithRetry) logging.basicConfig(level=logging.INFO) @@ -67,8 +67,9 @@ # no query string -> defaults to 'all' queue_path = "trials/dequeue/?workerType=" + workerType try: - r = requests.get("{}{}".format(API_URL, queue_path), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('GET', + "{}{}".format(API_URL, queue_path), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: traceback.print_exc() time.sleep(15) @@ -120,8 +121,10 @@ error_msg['error_msg'] = 'No videos uploaded. Ensure phones are connected and you have stable internet connection.' error_msg['error_msg_dev'] = 'No videos uploaded.' 
- r = requests.patch(trial_url, data={"status": "error", "meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data={"status": "error", "meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) continue # The following is now done in main, to allow reprocessing trials with missing videos @@ -149,14 +152,18 @@ # note a result needs to be posted for the API to know we finished, but we are posting them # automatically thru procesTrial now - r = requests.patch(trial_url, data={"status": "done"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data={"status": "done"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + logging.info('0.5s pause if need to restart.') time.sleep(0.5) except Exception as e: - r = requests.patch(trial_url, data={"status": "error"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, data={"status": "error"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() # Antoine: Removing this, it is too often causing the machines to stop. Not because diff --git a/tests/test_api.py b/tests/test_api.py index 232943d6..e9e38c02 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -17,7 +17,10 @@ def test_get(self, mock_response): response = makeRequestWithRetry('GET', 'https://test.com', retries=2) assert response.status_code == status_code mock_response.assert_called_once_with('GET', 'https://test.com', - headers=ANY, data=ANY, params=ANY) + headers=None, + data=None, + params=None, + files=None) @patch("requests.Session.request") def test_put(self, mock_response): @@ -45,7 +48,8 @@ def test_put(self, mock_response): 'https://test.com', data=data, headers={"Authorization": "my_token"}, - params=params) + params=params, + files=None) @patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") def test_success_after_retries(self, mock_response): diff --git a/utils.py b/utils.py index 0c4ab636..81be7d96 100644 --- a/utils.py +++ b/utils.py @@ -103,13 +103,17 @@ def download_file(url, file_name): shutil.copyfileobj(response, out_file) def getTrialJson(trial_id): - trialJson = requests.get(API_URL + "trials/{}/".format(trial_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "trials/{}/".format(trial_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + trialJson = response.json() return trialJson def getSessionJson(session_id): - sessionJson = requests.get(API_URL + "sessions/{}/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + sessionJson = response.json() # sort trials by time recorded def getCreatedAt(trial): @@ -119,8 +123,10 @@ def getCreatedAt(trial): return sessionJson def getSubjectJson(subject_id): - subjectJson = requests.get(API_URL + "subjects/{}/".format(subject_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "subjects/{}/".format(subject_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + subjectJson = response.json() return subjectJson def getTrialName(trial_id): @@ -183,8 +189,10 @@ def 
postCalibrationOptions(session_path,session_id,overwrite=False): "meta":json.dumps({'calibration':calibOptionsJson}) } trial_url = "{}{}{}/".format(API_URL, "trials/", calibration_id) - r= requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code == 200: print('Wrote calibration selections to metadata.') @@ -458,8 +466,9 @@ def deleteResult(trial_id, tag=None,resultNum=None): resultNums = [r['id'] for r in trial['results']] for rNum in resultNums: - requests.delete(API_URL + "results/{}/".format(rNum), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + makeRequestWithRetry('DELETE', + API_URL + "results/{}/".format(rNum), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) def deleteAllResults(session_id): @@ -687,8 +696,10 @@ def changeSessionMetadata(session_ids,newMetaDict): data = {"meta":json.dumps(existingMeta)} - r= requests.patch(session_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + session_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code !=200: print('Changing metadata failed.') @@ -735,9 +746,11 @@ def makeSessionPublic(session_id,publicStatus=True): data = { "public":publicStatus } - - r= requests.patch(session_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + + r = makeRequestWithRetry('PATCH', + session_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code == 200: print('Successfully made ' + session_id + ' public.') @@ -861,11 +874,17 @@ def postFileToTrial(filePath,trial_id,tag,device_id): # get S3 link data = {'fileName':os.path.split(filePath)[1]} - r = requests.get(API_URL + "sessions/null/get_presigned_url/",data=data).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/null/get_presigned_url/", + data=data) + r = response.json() # upload to S3 files = {'file': open(filePath, 'rb')} - requests.post(r['url'], data=r['fields'],files=files) + makeRequestWithRetry('POST', + r['url'], + data=r['fields'], + files=files) files["file"].close() # post link to and data to results @@ -876,8 +895,10 @@ def postFileToTrial(filePath,trial_id,tag,device_id): "media_url" : r['fields']['key'] } - rResult = requests.post(API_URL + "results/", data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + rResult = makeRequestWithRetry('POST', + API_URL + "results/", + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if rResult.status_code != 201: print('server response was + ' + str(r.status_code)) @@ -1484,8 +1505,11 @@ def checkForTrialsWithStatus(status,hours=9999999,relativeTime='newer'): 'justNumber':1, 'relativeTime':relativeTime} - r = requests.get(API_URL+"trials/get_trials_with_status/",params=params, - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL+"trials/get_trials_with_status/", + params=params, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = response.json() return r['nTrials'] @@ -1565,8 +1589,10 @@ def checkCudaTF(): # %% Some functions for loading subject data def getSubjectNumber(subjectName): - subjects = requests.get(API_URL + "subjects/", - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + 
API_URL + "subjects/", + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + subjects = response.json() sNum = [s['id'] for s in subjects if s['name'] == subjectName] if len(sNum)>1: print(len(sNum) + ' subjects with the name ' + subjectName + '. Will use the first one.') @@ -1576,8 +1602,10 @@ def getSubjectNumber(subjectName): return sNum[0] def getUserSessions(): - sessionJson = requests.get(API_URL + "sessions/valid/", - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/valid/", + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + sessionJson = response.json() return sessionJson def getSubjectSessions(subjectName): @@ -1639,8 +1667,10 @@ def postLocalClientInfo(trial_url): "git_commit": getCommitHash(), "hostname": getHostname() } - r = requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r @@ -1651,23 +1681,26 @@ def postProcessedDuration(trial_url, duration): data = { "processed_duration": duration } - r = requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r # utils for common HTTP requests def makeRequestWithRetry(method, url, - headers=None, data=None, params=None, + headers=None, data=None, params=None, files=None, retries=5, backoff_factor=1): """ Makes an HTTP request with retry logic and returns the Response object. Args: - method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.). + method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.) as used in + requests.Session().request() url (str): The endpoint URL. headers (dict): Headers to include in the request. - data (dict): Data to send in the request body (for POST/PUT requests). + data (dict): Data to send in the request body. params (dict): URL query parameters. retries (int): Number of retry attempts. backoff_factor (float): Backoff factor for exponential delays. 
@@ -1680,7 +1713,7 @@ def makeRequestWithRetry(method, url, total=retries, backoff_factor=backoff_factor, status_forcelist=[429, 500, 502, 503, 504], - allowed_methods={'DELETE', 'GET', 'POST', 'PUT'} + allowed_methods={'DELETE', 'GET', 'POST', 'PUT', 'PATCH'} ) adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) @@ -1690,7 +1723,8 @@ def makeRequestWithRetry(method, url, url, headers=headers, data=data, - params=params) + params=params, + files=files) response.raise_for_status() return response diff --git a/utilsChecker.py b/utilsChecker.py index ee8a390a..41f8f931 100644 --- a/utilsChecker.py +++ b/utilsChecker.py @@ -24,6 +24,7 @@ from utilsCameraPy3 import Camera, nview_linear_triangulations from utils import getOpenPoseMarkerNames, getOpenPoseFaceMarkers from utils import numpy2TRC, rewriteVideos, delete_multiple_element,loadCameraParameters +from utils import makeRequestWithRetry from utilsAPI import getAPIURL from utilsAuth import getToken @@ -198,8 +199,9 @@ def computeAverageIntrinsics(session_path,trialIDs,CheckerBoardParams,nImages=25 camModels = [] for trial_id in trialIDs: - resp = requests.get(API_URL + "trials/{}/".format(trial_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + resp = makeRequestWithRetry('GET', + API_URL + "trials/{}/".format(trial_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) trial = resp.json() camModels.append(trial['videos'][0]['parameters']['model']) trial_name = trial['name'] diff --git a/utilsServer.py b/utilsServer.py index f97877c5..17d5da51 100644 --- a/utilsServer.py +++ b/utilsServer.py @@ -25,6 +25,7 @@ from utils import importMetadata from utils import checkAndGetPosePickles from utils import getTrialNameIdMapping +from utils import makeRequestWithRetry from utilsAuth import getToken from utilsAPI import getAPIURL @@ -67,8 +68,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Calibration failed', e.args[0], e.args[1]) if not hasWritePermissions: @@ -143,8 +146,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Static trial failed', e.args[0], e.args[1]) if not hasWritePermissions: @@ -233,8 +238,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Dynamic trial failed.\n' + error_msg['error_msg_dev'], e.args[0], e.args[1]) if not hasWritePermissions: @@ -352,8 +359,10 @@ def 
batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto print('Processing ' + session_id) # check if write permissions (session owner or admin) - permissions = requests.get(API_URL + "sessions/{}/get_session_permission/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/get_session_permission/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + permissions = response.json() hasWritePermissions = permissions['isAdmin'] or permissions['isOwner'] @@ -373,13 +382,17 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto hasWritePermissions = hasWritePermissions, cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(calib_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(calib_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(calib_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(calib_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if static_id == None: static_id_toProcess = getNeutralTrialID(session_id) @@ -400,17 +413,22 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto batchProcess = True, cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(static_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(static_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(static_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(static_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if dynamic_ids == None: - - session = requests.get(API_URL + "sessions/{}/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + session = response.json() dynamic_ids_toProcess = [t['id'] for t in session['trials'] if (t['name'] != 'calibration' and t['name'] !='neutral')] else: if type(dynamic_ids) == str: @@ -433,13 +451,17 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(dID), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(dID), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(dID), 
data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(dID), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) def runTestSession(pose='all',isDocker=True): trials = {} From 57e14f41658b5027cd65707bf543aa644a98e947 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 15:10:58 -0800 Subject: [PATCH 07/19] do not use retries for dequeue loop in app --- app.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index 5b7d26c4..9bceb8ee 100644 --- a/app.py +++ b/app.py @@ -67,9 +67,8 @@ # no query string -> defaults to 'all' queue_path = "trials/dequeue/?workerType=" + workerType try: - r = makeRequestWithRetry('GET', - "{}{}".format(API_URL, queue_path), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = requests.get("{}{}".format(API_URL, queue_path), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: traceback.print_exc() time.sleep(15) @@ -189,4 +188,4 @@ folders = glob.glob(os.path.join(getDataDirectory(isDocker=True),'Data','*')) for f in folders: shutil.rmtree(f) - logging.info('deleting ' + f) \ No newline at end of file + logging.info('deleting ' + f) From 4bfcf1c57831c08bf82d3f87657fd71bc5369150 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 10 Dec 2024 14:27:50 -0800 Subject: [PATCH 08/19] Intercepting error of human detection pose not detected in hrnet. --- utilsDetector.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/utilsDetector.py b/utilsDetector.py index 1e57ddef..31557464 100644 --- a/utilsDetector.py +++ b/utilsDetector.py @@ -319,9 +319,13 @@ def runMMposeVideo( # copy /data/output to pathOutputPkl os.system("cp /data/output_mmpose/* {pathOutputPkl}/".format(pathOutputPkl=pathOutputPkl)) - pkl_path_tmp = os.path.join(pathOutputPkl, 'human.pkl') - os.rename(pkl_path_tmp, pklPath) - + pkl_path_tmp = os.path.join(pathOutputPkl, 'human.pkl') + if os.path.exists(pkl_path_tmp): + os.rename(pkl_path_tmp, pklPath) + else: + raise FileNotFoundError( + "We could not detect any pose in your video. Please verify that the subject is correctly in front of the camera." + ) except Exception as e: if len(e.args) == 2: # specific exception raise Exception(e.args[0], e.args[1]) From 8c1459ca3dac202311ac50654c3d7b73c0e1acc2 Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Thu, 12 Dec 2024 16:30:19 +0100 Subject: [PATCH 09/19] support for lying checkerboard --- main.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index aba3d625..678fc630 100644 --- a/main.py +++ b/main.py @@ -303,26 +303,18 @@ def main(sessionName, trialName, trial_id, cameras_to_use=['all'], trialRelativePath = os.path.join('InputMedia', trialName, trial_id) if runPoseDetection: - # Detect if checkerboard is upside down. - upsideDownChecker = isCheckerboardUpsideDown(CamParamDict) # Get rotation angles from motion capture environment to OpenSim. # Space-fixed are lowercase, Body-fixed are uppercase. checkerBoardMount = sessionMetadata['checkerBoard']['placement'] - if checkerBoardMount == 'backWall' and not upsideDownChecker: - rotationAngles = {'y':90, 'z':180} - elif checkerBoardMount == 'backWall' and upsideDownChecker: - rotationAngles = {'y':-90} - elif checkerBoardMount == 'backWall_largeCB': - rotationAngles = {'y':-90} - # TODO: uppercase? 
- elif checkerBoardMount == 'backWall_walking': - rotationAngles = {'YZ':(-90,180)} - elif checkerBoardMount == 'ground': - rotationAngles = {'x':-90, 'y':90} - elif checkerBoardMount == 'ground_jumps': # for sub1 - rotationAngles = {'x':90, 'y':180} - elif checkerBoardMount == 'ground_gaits': # for sub1 - rotationAngles = {'x':90, 'y':90} + if checkerBoardMount == 'backWall' or checkerBoardMount == 'Perpendicular': + # Detect if checkerboard is upside down. + upsideDownChecker = isCheckerboardUpsideDown(CamParamDict) + if upsideDownChecker: + rotationAngles = {'y':-90} + else: + rotationAngles = {'y':90, 'z':180} + elif checkerBoardMount == 'ground' or checkerBoardMount == 'Lying': + rotationAngles = {'x':90, 'y':90} else: raise Exception('checkerBoard placement value in\ sessionMetadata.yaml is not currently supported') From 0cb03721da28647951dbfec93b83647d2591e514 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 14 Jan 2025 16:10:12 -0800 Subject: [PATCH 10/19] Allowing running multiple instances of opencap in one server. --- docker/Makefile | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/docker/Makefile b/docker/Makefile index fda79023..9c2e71e8 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -68,12 +68,9 @@ endif .PHONY: run run: -ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH)) - aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com - -else ifeq ($(CURRENT_BRANCH),$(DEV_BRANCH)) - aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com - -endif - - OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) docker-compose up \ No newline at end of file + @if [ -z "$(INSTANCE_ID)" ]; then echo "INSTANCE_ID is required. Use make run INSTANCE_ID="; exit 1; fi + COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ + OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ + OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ + MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \ + docker-compose up From f807e2583c5f4ee2cd9414ab06b7254c28f806d7 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 21 Jan 2025 11:13:54 -0800 Subject: [PATCH 11/19] Modified Makefile and docker-compose to allow multiple instances in same machine. --- docker/Makefile | 13 +++++++-- docker/docker-compose.yaml | 9 ++++++ docker/start-script.sh | 58 ++++++++++++++++++++++++++++++++++++++ docker/stop-all-script.sh | 1 + docker/stop-script.sh | 25 ++++++++++++++++ 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100755 docker/start-script.sh create mode 100755 docker/stop-all-script.sh create mode 100755 docker/stop-script.sh diff --git a/docker/Makefile b/docker/Makefile index 9c2e71e8..826d30b8 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -68,9 +68,18 @@ endif .PHONY: run run: - @if [ -z "$(INSTANCE_ID)" ]; then echo "INSTANCE_ID is required. Use make run INSTANCE_ID="; exit 1; fi + @if [ -z "$(INSTANCE_ID)" ]; then \ + echo "INSTANCE_ID is required. Use make run INSTANCE_ID= CPU_SET="; \ + exit 1; \ + fi + @if [ -z "$(CPU_SET)" ]; then \ + echo "CPU_SET is required. 
Use make run INSTANCE_ID= CPU_SET="; \ + exit 1; \ + fi COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \ - docker-compose up + INSTANCE_ID=$(INSTANCE_ID) \ + CPU_SET=$(CPU_SET) \ + docker compose up -d diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 169ff406..a78264d8 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -8,6 +8,7 @@ services: - ../.env environment: - DOCKERCOMPOSE=1 + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -15,10 +16,13 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" openpose: image: ${OPENPOSE_IMAGE_NAME} volumes: - data:/openpose/data + environment: + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -26,10 +30,13 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" mmpose: image: ${MMPOSE_IMAGE_NAME} volumes: - data:/mmpose/data + environment: + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -37,5 +44,7 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" + volumes: data: {} diff --git a/docker/start-script.sh b/docker/start-script.sh new file mode 100755 index 00000000..c677a007 --- /dev/null +++ b/docker/start-script.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Configuration +MAX_INSTANCES=8 +CPUS_PER_INSTANCE=14 +GPUS_PER_INSTANCE=1 + +# Get the total number of CPUs and GPUs available +TOTAL_CPUS=$(nproc) +TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +# Read number of instances to start +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Provide the number of instances to start (max $MAX_INSTANCES)." + exit 1 +fi + +NUM_INSTANCES=$1 + +# Validate the number of instances +if (( NUM_INSTANCES > MAX_INSTANCES )); then + echo "Error: Maximum number of instances is $MAX_INSTANCES." + exit 1 +fi + +# Check if there are enough resources +if (( NUM_INSTANCES * CPUS_PER_INSTANCE > TOTAL_CPUS )); then + echo "Error: Not enough CPUs. Required: $((NUM_INSTANCES * CPUS_PER_INSTANCE)), Available: $TOTAL_CPUS." + exit 1 +fi + +if (( NUM_INSTANCES * GPUS_PER_INSTANCE > TOTAL_GPUS )); then + echo "Error: Not enough GPUs. Required: $((NUM_INSTANCES * GPUS_PER_INSTANCE)), Available: $TOTAL_GPUS." + exit 1 +fi + +# Display summary +echo "Starting $NUM_INSTANCES instances..." +echo "Total CPUs: $TOTAL_CPUS (using $CPUS_PER_INSTANCE per instance)" +echo "Total GPUs: $TOTAL_GPUS (using $GPUS_PER_INSTANCE per instance)" +echo + +# Start instances +for (( i=0; i" + exit 1 +fi + +INSTANCE_ID=$1 +COMPOSE_PROJECT_NAME="opencap_${INSTANCE_ID}" + +echo "Stopping and removing containers for INSTANCE_ID=${INSTANCE_ID}..." + +# Stop and remove containers associated with the project +docker-compose \ + --project-name $COMPOSE_PROJECT_NAME \ + down + +# Verify if containers are removed +if [ $? -eq 0 ]; then + echo "Successfully stopped and removed containers for INSTANCE_ID=${INSTANCE_ID}." +else + echo "Failed to stop and remove containers for INSTANCE_ID=${INSTANCE_ID}." +fi + From 98a385db50456a2ed6cd058f42c3dc056b84dbcf Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Thu, 23 Jan 2025 12:27:25 -0800 Subject: [PATCH 12/19] Renamed scripts. Added script to stop all. 
--- docker/{start-script.sh => start-containers.sh} | 4 +++- docker/stop-all-containers.sh | 1 + docker/stop-all-script.sh | 1 - docker/{stop-script.sh => stop-container.sh} | 0 4 files changed, 4 insertions(+), 2 deletions(-) rename docker/{start-script.sh => start-containers.sh} (99%) create mode 100755 docker/stop-all-containers.sh delete mode 100755 docker/stop-all-script.sh rename docker/{stop-script.sh => stop-container.sh} (100%) diff --git a/docker/start-script.sh b/docker/start-containers.sh similarity index 99% rename from docker/start-script.sh rename to docker/start-containers.sh index c677a007..6b53a404 100755 --- a/docker/start-script.sh +++ b/docker/start-containers.sh @@ -49,9 +49,11 @@ for (( i=0; i Date: Tue, 28 Jan 2025 11:12:01 -0800 Subject: [PATCH 13/19] Added script to check container status and start single containers. --- docker/check-containers-health.sh | 34 +++++++++++++++++++++ docker/start-container.sh | 51 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100755 docker/check-containers-health.sh create mode 100755 docker/start-container.sh diff --git a/docker/check-containers-health.sh b/docker/check-containers-health.sh new file mode 100755 index 00000000..e87cfc0c --- /dev/null +++ b/docker/check-containers-health.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Function to check if a container is running +is_container_alive() { + local container_name=$1 + docker ps --filter "name=^/${container_name}$" --filter "status=running" --format '{{.Names}}' | grep -wq "$container_name" + return $? +} + +# Loop through numbers 0 to 7 +for n in {0..7}; do + # Container names + opencap_openpose="opencap_${n}-openpose-1" + opencap_mmpose="opencap_${n}-mmpose-1" + opencap_mobilecap="opencap_${n}-mobilecap-1" + + # Check if all three containers are alive + if is_container_alive "$opencap_openpose" && \ + is_container_alive "$opencap_mmpose" && \ + is_container_alive "$opencap_mobilecap"; then + echo "All containers for instance $n are alive. Skipping." + continue + fi + + # Check if any container exists + if docker ps -a --filter "name=^/opencap_${n}-(openpose|mmpose|mobilecap)-1$" --format '{{.Names}}' | grep -q "opencap_${n}"; then + echo "Some containers for instance $n are not alive. Stopping instance." + ./stop-container.sh "$n" + ./start-container.sh "$n" + else + echo "No containers for instance $n. Skipping." + fi + +done diff --git a/docker/start-container.sh b/docker/start-container.sh new file mode 100755 index 00000000..18a3dd36 --- /dev/null +++ b/docker/start-container.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Configuration +MAX_INSTANCES=8 +CPUS_PER_INSTANCE=14 +GPUS_PER_INSTANCE=1 + +# Get the total number of CPUs and GPUs available +TOTAL_CPUS=$(nproc) +TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +# Check if an instance number is provided +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Provide the instance number to start (0 to $((MAX_INSTANCES - 1)))." + exit 1 +fi + +INSTANCE_NUMBER=$1 + +# Validate the instance number +if (( INSTANCE_NUMBER < 0 || INSTANCE_NUMBER >= MAX_INSTANCES )); then + echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))." 
+ exit 1 +fi + +# Compute CPU and GPU offsets for the selected instance +CPU_START=$(( INSTANCE_NUMBER * CPUS_PER_INSTANCE )) +CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 )) +CPU_SET="${CPU_START}-${CPU_END}" + +# Validate resource availability +if (( CPU_START + CPUS_PER_INSTANCE > TOTAL_CPUS )); then + echo "Error: Not enough CPUs available for instance $INSTANCE_NUMBER." + exit 1 +fi + +if (( INSTANCE_NUMBER >= TOTAL_GPUS )); then + echo "Error: Not enough GPUs available for instance $INSTANCE_NUMBER." + exit 1 +fi + +# Start the specific instance +echo "Starting instance $INSTANCE_NUMBER with CPU_SET=${CPU_SET} and GPU=${INSTANCE_NUMBER}" + +# Run docker-compose for the specific instance +make run INSTANCE_ID=$INSTANCE_NUMBER CPU_SET=$CPU_SET + +sleep 10 + +echo "Instance $INSTANCE_NUMBER started successfully." From 666e1654bca971b90709cf36c5d82470690d4d7c Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 28 Jan 2025 15:52:07 -0800 Subject: [PATCH 14/19] Added ugly logging to docker containers. Added try catch to finally in app.py --- app.py | 7 +++++-- docker/docker-compose.yaml | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 9bceb8ee..42dfd813 100644 --- a/app.py +++ b/app.py @@ -178,8 +178,11 @@ finally: # End process duration timer and post duration to database - process_end_time = datetime.now() - postProcessedDuration(trial_url, process_end_time - process_start_time) + try: + process_end_time = datetime.now() + postProcessedDuration(trial_url, process_end_time - process_start_time) + except Exception as e: + traceback.print_exc() justProcessed = True diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index a78264d8..4b48f969 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -17,6 +17,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files openpose: image: ${OPENPOSE_IMAGE_NAME} volumes: @@ -31,6 +36,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files mmpose: image: ${MMPOSE_IMAGE_NAME} volumes: @@ -45,6 +55,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files volumes: data: {} From f42a6026fb2597ec59b013c3d5c94ca8194f1f1c Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Thu, 30 Jan 2025 14:48:11 -0800 Subject: [PATCH 15/19] catch exception if patch fails in except block --- app.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index 42dfd813..2226cceb 100644 --- a/app.py +++ b/app.py @@ -160,10 +160,13 @@ time.sleep(0.5) except Exception as e: - r = makeRequestWithRetry('PATCH', - trial_url, data={"status": "error"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) - traceback.print_exc() + try: + r = makeRequestWithRetry('PATCH', + trial_url, data={"status": "error"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + traceback.print_exc() + except: + traceback.print_exc() # Antoine: Removing this, it is too often causing the machines to stop. 
Not because # the machines are failing, but because for instance the video is very long with a lot From b14f5bc38b0a77386bf8dbf58fb187e661f47fa4 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Mon, 3 Feb 2025 18:08:14 -0800 Subject: [PATCH 16/19] use different gpu for each instance --- docker/docker-compose.yaml | 11 +++-------- docker/stop-all-containers.sh | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 4b48f969..55acdbdc 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -8,14 +8,13 @@ services: - ../.env environment: - DOCKERCOMPOSE=1 - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" @@ -26,15 +25,13 @@ services: image: ${OPENPOSE_IMAGE_NAME} volumes: - data:/openpose/data - environment: - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" @@ -45,15 +42,13 @@ services: image: ${MMPOSE_IMAGE_NAME} volumes: - data:/mmpose/data - environment: - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" diff --git a/docker/stop-all-containers.sh b/docker/stop-all-containers.sh index eab78ea0..21c4441b 100755 --- a/docker/stop-all-containers.sh +++ b/docker/stop-all-containers.sh @@ -1 +1 @@ -for i in $(seq 0 8); do ./stop-container.sh $i; done +for i in $(seq 0 7); do ./stop-container.sh $i; done From 638807e0e3e5f206c9611d2ba5ddaeddbd05fc21 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Tue, 4 Feb 2025 17:25:05 -0800 Subject: [PATCH 17/19] add error logging json file (optional) for machines running app.py --- app.py | 24 ++++++++++++++++++++++-- utils.py | 30 ++++++++++++++++++++++++++++++ utilsAPI.py | 3 +++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 2226cceb..a1ce13a9 100644 --- a/app.py +++ b/app.py @@ -9,12 +9,13 @@ import glob from datetime import datetime, timedelta import numpy as np -from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials +from utilsAPI import getAPIURL, getWorkerType, getErrorLogBool, getASInstance, unprotect_current_instance, get_number_of_pending_trials from utilsAuth import getToken from utils import (getDataDirectory, checkTime, checkResourceUsage, sendStatusEmail, checkForTrialsWithStatus, getCommitHash, getHostname, postLocalClientInfo, - postProcessedDuration, makeRequestWithRetry) + postProcessedDuration, makeRequestWithRetry, + writeToErrorLog) logging.basicConfig(level=logging.INFO) @@ -24,6 +25,9 @@ autoScalingInstance = getASInstance() logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}") +ERROR_LOG = getErrorLogBool() +error_log_path = "/data/error_log.json" + # if true, will delete entire data directory when finished with a trial isDocker = True @@ -165,9 +169,20 @@ trial_url, data={"status": "error"}, headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() + + if ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + except: traceback.print_exc() + if 
ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + # Antoine: Removing this, it is too often causing the machines to stop. Not because # the machines are failing, but because for instance the video is very long with a lot # of people in it. We should not stop the machine for that. Originally the check was @@ -187,6 +202,11 @@ except Exception as e: traceback.print_exc() + if ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + justProcessed = True # Clean data directory diff --git a/utils.py b/utils.py index 2b140eed..45bd613d 100644 --- a/utils.py +++ b/utils.py @@ -12,6 +12,7 @@ import subprocess import zipfile import time +import datetime import numpy as np import pandas as pd @@ -1585,6 +1586,35 @@ def checkCudaTF(): sendStatusEmail(message=message) raise Exception("No GPU detected. Exiting.") +def writeToJsonLog(path, new_dict, max_entries=1000): + dir_name = os.path.dirname(path) + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + if os.path.exists(path): + with open(path, 'r') as f: + data = json.load(f) + else: + data = [] + + data.append(new_dict) + + while len(data) > max_entries: + data.pop(0) + + with open(path, 'w') as f: + json.dump(data, f) + +def writeToErrorLog(path, session_id, trial_id, error, stack, max_entries=1000): + error_entry = { + 'session_id': session_id, + 'trial_id': trial_id, + 'datetime': datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + 'error': str(error), + 'stack': stack + } + writeToJsonLog(path, error_entry, max_entries) + # %% Some functions for loading subject data def getSubjectNumber(subjectName): diff --git a/utilsAPI.py b/utilsAPI.py index 982bf4f7..1535406c 100644 --- a/utilsAPI.py +++ b/utilsAPI.py @@ -48,6 +48,9 @@ def getStatusEmails(): return emailInfo +def getErrorLogBool(): + return config('ERROR_LOG', default=False, cast=bool) + def getASInstance(): try: # Check if the ECS_CONTAINER_METADATA_FILE environment variable exists From 7d7e215f045138d9b7706cbaf35e2e9c0e9f24e0 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 5 Feb 2025 15:56:36 -0800 Subject: [PATCH 18/19] set defaults for INSTANCE_ID and CPU_SET to restore make run usage --- docker/Makefile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docker/Makefile b/docker/Makefile index 826d30b8..ff2197b5 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -3,6 +3,10 @@ REPO_NAME := opencap PROD_BRANCH := main DEV_BRANCH := dev +# Initialize variables if not passed in +INSTANCE_ID ?= 0 +CPU_SET ?= "" + # Determine the branch name CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) @@ -68,14 +72,9 @@ endif .PHONY: run run: - @if [ -z "$(INSTANCE_ID)" ]; then \ - echo "INSTANCE_ID is required. Use make run INSTANCE_ID= CPU_SET="; \ - exit 1; \ - fi - @if [ -z "$(CPU_SET)" ]; then \ - echo "CPU_SET is required. 
Use make run INSTANCE_ID= CPU_SET="; \ - exit 1; \ - fi + @echo "Usage: sudo make run INSTANCE_ID= CPU_SET=" + @echo "Defaults: INSTANCE_ID=0, CPU_SET=\"\"" + COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ From 116e8d4702876327005be026e6c9206e261ff617 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Thu, 6 Feb 2025 16:29:18 -0800 Subject: [PATCH 19/19] retry test session with random waiting time, simplify makeRequestWithRetry error handling --- docker/start-containers.sh | 2 +- utils.py | 44 ++++++++----------- utilsServer.py | 90 +++++++++++++++++++++++++------------- 3 files changed, 80 insertions(+), 56 deletions(-) diff --git a/docker/start-containers.sh b/docker/start-containers.sh index 6b53a404..fb281e39 100755 --- a/docker/start-containers.sh +++ b/docker/start-containers.sh @@ -53,7 +53,7 @@ for (( i=0; i
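
A minimal usage sketch of the makeRequestWithRetry wrapper that this series introduces in utils.py. The function name, signature, and retry behavior come from the patches above; the base URL, token, and trial id below are hypothetical placeholders, not values from the OpenCap API.

from utils import makeRequestWithRetry

API_URL = "https://api.example.com/"   # placeholder, not the real API URL
API_TOKEN = "my_token"                 # placeholder credential

# GET: responses with status 429/500/502/503/504 are retried with
# exponential backoff (per the urllib3 Retry strategy configured in
# utils.py); a final HTTP error is re-raised as a plain Exception.
response = makeRequestWithRetry(
    'GET',
    API_URL + "sessions/valid/",
    headers={"Authorization": "Token {}".format(API_TOKEN)},
    retries=3,
    backoff_factor=0.5,
)
sessions = response.json()

# PATCH with a form body. PATCH is also retried, since the
# "replace requests calls with retry wrapper" commit adds it to
# allowed_methods.
r = makeRequestWithRetry(
    'PATCH',
    API_URL + "trials/some-trial-id/",   # placeholder trial id
    data={"status": "done"},
    headers={"Authorization": "Token {}".format(API_TOKEN)},
)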
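
PATCH 19/19 is cut off above before its utils.py and utilsServer.py hunks. As a rough, generic sketch of the idea named in its subject line (retrying the test session after a random waiting time), and explicitly not a reconstruction of the committed change; the helper name and parameters below are invented for illustration:

import random
import time

def runWithRandomRetry(func, attempts=3, min_wait_s=5, max_wait_s=30):
    """Call func(); on failure, sleep a random interval and try again.

    Randomizing the wait keeps parallel workers from retrying in
    lockstep. Hypothetical helper, not code from the patch.
    """
    for attempt in range(attempts):
        try:
            return func()
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(random.uniform(min_wait_s, max_wait_s))

# e.g., wrapping the test-session entry point defined in utilsServer.py:
# runWithRandomRetry(lambda: runTestSession(pose='all', isDocker=True))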