From 079e31e77ae7d7d47d5c2fe650b116cc83865fd3 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Tue, 12 Nov 2024 17:15:11 -0800 Subject: [PATCH 01/19] add mapping for genders in metadata --- utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 3f85d39c..a5ad223e 100644 --- a/utils.py +++ b/utils.py @@ -360,8 +360,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc = importMetadata(defaultMetadataPath) # Get session-specific metadata from api. - - session = getSessionJson(session_id) + session = getSessionJson(session_id) if session['meta'] is not None: if not justCheckerParams: # Backward compatibility @@ -370,7 +369,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc["mass_kg"] = float(session['meta']['subject']['mass']) session_desc["height_m"] = float(session['meta']['subject']['height']) if 'gender' in session['meta']['subject']: - session_desc["gender_mf"] = session['meta']['subject']['gender'] + session_desc["gender_mf"] = getGendersDict().get(session['meta']['subject']['gender']) # Before implementing the subject feature, the posemodel was stored # in session['meta']['subject']. After implementing the subject # feature, the posemodel is stored in session['meta']['settings'] @@ -404,7 +403,7 @@ def getMetadataFromServer(session_id,justCheckerParams=False): session_desc["subjectID"] = subject_info['name'] session_desc["mass_kg"] = subject_info['weight'] session_desc["height_m"] = subject_info['height'] - session_desc["gender_mf"] = subject_info['gender'] + session_desc["gender_mf"] = getGendersDict().get(subject_info['gender']) try: session_desc["posemodel"] = session['meta']['settings']['posemodel'] except: @@ -1617,6 +1616,16 @@ def get_entry_with_largest_number(trialList): return max_entry +def getGendersDict(): + genders_dict = { + "woman": "Woman", + "man": "Man", + "transgender": "Transgender", + "non-binary": "Non-Binary/Non-Conforming", + "prefer-not-respond": "Prefer not to respond", + } + return genders_dict + # Get local client info and update def getCommitHash(): From f9e32be2bfaf2ae8e7db6b8c704ee7945733f666 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 15:07:01 -0800 Subject: [PATCH 02/19] add makeRequestWithRetry and initial test --- .gitignore | 2 -- utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e94340b0..d1e705b1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,4 @@ Examples/reprocessDataServer.py *.ini *.stats -tests/ - newsletter.py diff --git a/utils.py b/utils.py index 3f85d39c..e3b56828 100644 --- a/utils.py +++ b/utils.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd from scipy import signal +from urllib3.util.retry import Retry from utilsAuth import getToken from utilsAPI import getAPIURL @@ -1654,3 +1655,46 @@ def postProcessedDuration(trial_url, duration): headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r + +# utils for common HTTP requests +def makeRequestWithRetry(method, url, + headers=None, data=None, params=None, + retries=5, backoff_factor=1): + """ + Makes an HTTP request with retry logic and returns the Response object. + + Args: + method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.). + url (str): The endpoint URL. + headers (dict): Headers to include in the request. + data (dict): Data to send in the request body (for POST/PUT requests). 
+ params (dict): URL query parameters. + retries (int): Number of retry attempts. + backoff_factor (float): Backoff factor for exponential delays. + + Returns: + requests.Response: The response object for further processing. + """ + try: + retry_strategy = Retry( + total=retries, + backoff_factor=backoff_factor, + status_forcelist=[429, 500, 502, 503, 504] + ) + + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + with requests.Session() as session: + session.mount("https://", adapter) + response = session.request(method=method, + url=url, + headers=headers, + data=data, + params=params) + response.raise_for_status() + return response + + except requests.exceptions.HTTPError as e: + raise Exception(f"HTTP error occurred: {e}") + + except Exception as e: + raise Exception(f"An error occurred: {e}") From e9fceecc5475c03e4c9bb87fca91c505a7f1b019 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 15:28:27 -0800 Subject: [PATCH 03/19] add tests folder --- tests/test_api.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/test_api.py diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 00000000..41a727b2 --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,9 @@ +from utils import makeRequestWithRetry + +def test_makeRequestWithRetry(self): + test_URL = 'http://httpbin.org/status/404' + + makeRequestWithRetry('GET', test_URL) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 6ff69d4c6897c33702ba29570bcee1c9e7ce3709 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 20 Nov 2024 17:09:45 -0800 Subject: [PATCH 04/19] add pytest and fill out tests for requests retry --- requirements.txt | 3 ++- tests/test_api.py | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3d626337..0937e2f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ pingouin==0.5.2 openpyxl ffmpeg-python psutil -boto3 \ No newline at end of file +boto3 +pytest diff --git a/tests/test_api.py b/tests/test_api.py index 41a727b2..24c0f92c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,9 +1,27 @@ +import logging +import pytest +import requests + from utils import makeRequestWithRetry -def test_makeRequestWithRetry(self): - test_URL = 'http://httpbin.org/status/404' +class TestMakeRequestWithRetry: + logging.getLogger('urllib3').setLevel(logging.DEBUG) - makeRequestWithRetry('GET', test_URL) + def test_get(self): + # should work and return 200 + response = makeRequestWithRetry('GET', 'https://httpbin.org/get') + assert response.status_code == 200 + assert response.json()['url'] == 'https://httpbin.org/get' + + def test_500(self): + # use an error code that triggers retries (should show up in debug log) + with pytest.raises(Exception): + response_500 = makeRequestWithRetry('GET', 'https://httpbin.org/status/500', retries=3, backoff_factor=0.1) + + def test_404(self): + # error code that won't trigger retries + with pytest.raises(Exception): + response_404 = makeRequestWithRetry('GET', 'https://httpbin.org/status/404', retires=2, backoff_factor=0.01) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 484f2bf61d9b6da5ef4a484a916c26f0fca4c910 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 13:23:07 -0800 Subject: [PATCH 05/19] use mock for tests. 
add post to allowed methods for retry --- tests/test_api.py | 81 +++++++++++++++++++++++++++++++++++++---------- utils.py | 7 ++-- 2 files changed, 68 insertions(+), 20 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 24c0f92c..232943d6 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,27 +1,74 @@ import logging import pytest import requests +from unittest.mock import patch, Mock, ANY +from http.client import HTTPMessage from utils import makeRequestWithRetry class TestMakeRequestWithRetry: logging.getLogger('urllib3').setLevel(logging.DEBUG) - def test_get(self): - # should work and return 200 - response = makeRequestWithRetry('GET', 'https://httpbin.org/get') + @patch("requests.Session.request") + def test_get(self, mock_response): + status_code = 200 + mock_response.return_value.status_code = status_code + + response = makeRequestWithRetry('GET', 'https://test.com', retries=2) + assert response.status_code == status_code + mock_response.assert_called_once_with('GET', 'https://test.com', + headers=ANY, data=ANY, params=ANY) + + @patch("requests.Session.request") + def test_put(self, mock_response): + status_code = 201 + mock_response.return_value.status_code = status_code + + data = { + "key1": "value1", + "key2": "value2" + } + + params = { + "param1": "value1" + } + + response = makeRequestWithRetry('POST', + 'https://test.com', + data=data, + headers={"Authorization": "my_token"}, + params=params, + retries=2) + + assert response.status_code == status_code + mock_response.assert_called_once_with('POST', + 'https://test.com', + data=data, + headers={"Authorization": "my_token"}, + params=params) + + @patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") + def test_success_after_retries(self, mock_response): + mock_response.return_value.getresponse.side_effect = [ + Mock(status=500, msg=HTTPMessage()), + Mock(status=502, msg=HTTPMessage()), + Mock(status=200, msg=HTTPMessage()), + Mock(status=429, msg=HTTPMessage()), + ] + + response = makeRequestWithRetry('GET', + 'https://test.com', + retries=5, + backoff_factor=0.1) + assert response.status_code == 200 - assert response.json()['url'] == 'https://httpbin.org/get' - - def test_500(self): - # use an error code that triggers retries (should show up in debug log) - with pytest.raises(Exception): - response_500 = makeRequestWithRetry('GET', 'https://httpbin.org/status/500', retries=3, backoff_factor=0.1) - - def test_404(self): - # error code that won't trigger retries - with pytest.raises(Exception): - response_404 = makeRequestWithRetry('GET', 'https://httpbin.org/status/404', retires=2, backoff_factor=0.01) - -if __name__ == '__main__': - unittest.main() + assert mock_response.call_count == 3 + + # comment out test since httpbin can be unstable and we don't want to rely + # on it for tests. 
uncomment and see debug log to see retry attempts + '''def test_httpbin(self): + response = makeRequestWithRetry('GET', + 'https://httpbin.org/status/500', + retries=4, + backoff_factor=0.1) + ''' \ No newline at end of file diff --git a/utils.py b/utils.py index e3b56828..0c4ab636 100644 --- a/utils.py +++ b/utils.py @@ -1679,14 +1679,15 @@ def makeRequestWithRetry(method, url, retry_strategy = Retry( total=retries, backoff_factor=backoff_factor, - status_forcelist=[429, 500, 502, 503, 504] + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods={'DELETE', 'GET', 'POST', 'PUT'} ) adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) with requests.Session() as session: session.mount("https://", adapter) - response = session.request(method=method, - url=url, + response = session.request(method, + url, headers=headers, data=data, params=params) From bda3d82cb2596c7cee0da7aa7e50eeda31041097 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 14:46:17 -0800 Subject: [PATCH 06/19] replace requests calls with retry wrapper --- app.py | 25 ++++++++---- tests/test_api.py | 8 +++- utils.py | 102 ++++++++++++++++++++++++++++++---------------- utilsChecker.py | 6 ++- utilsServer.py | 68 ++++++++++++++++++++----------- 5 files changed, 139 insertions(+), 70 deletions(-) diff --git a/app.py b/app.py index 7a636cf7..5b7d26c4 100644 --- a/app.py +++ b/app.py @@ -14,7 +14,7 @@ from utils import (getDataDirectory, checkTime, checkResourceUsage, sendStatusEmail, checkForTrialsWithStatus, getCommitHash, getHostname, postLocalClientInfo, - postProcessedDuration) + postProcessedDuration, makeRequestWithRetry) logging.basicConfig(level=logging.INFO) @@ -67,8 +67,9 @@ # no query string -> defaults to 'all' queue_path = "trials/dequeue/?workerType=" + workerType try: - r = requests.get("{}{}".format(API_URL, queue_path), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('GET', + "{}{}".format(API_URL, queue_path), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: traceback.print_exc() time.sleep(15) @@ -120,8 +121,10 @@ error_msg['error_msg'] = 'No videos uploaded. Ensure phones are connected and you have stable internet connection.' error_msg['error_msg_dev'] = 'No videos uploaded.' 
- r = requests.patch(trial_url, data={"status": "error", "meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data={"status": "error", "meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) continue # The following is now done in main, to allow reprocessing trials with missing videos @@ -149,14 +152,18 @@ # note a result needs to be posted for the API to know we finished, but we are posting them # automatically thru procesTrial now - r = requests.patch(trial_url, data={"status": "done"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data={"status": "done"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + logging.info('0.5s pause if need to restart.') time.sleep(0.5) except Exception as e: - r = requests.patch(trial_url, data={"status": "error"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, data={"status": "error"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() # Antoine: Removing this, it is too often causing the machines to stop. Not because diff --git a/tests/test_api.py b/tests/test_api.py index 232943d6..e9e38c02 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -17,7 +17,10 @@ def test_get(self, mock_response): response = makeRequestWithRetry('GET', 'https://test.com', retries=2) assert response.status_code == status_code mock_response.assert_called_once_with('GET', 'https://test.com', - headers=ANY, data=ANY, params=ANY) + headers=None, + data=None, + params=None, + files=None) @patch("requests.Session.request") def test_put(self, mock_response): @@ -45,7 +48,8 @@ def test_put(self, mock_response): 'https://test.com', data=data, headers={"Authorization": "my_token"}, - params=params) + params=params, + files=None) @patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") def test_success_after_retries(self, mock_response): diff --git a/utils.py b/utils.py index 0c4ab636..81be7d96 100644 --- a/utils.py +++ b/utils.py @@ -103,13 +103,17 @@ def download_file(url, file_name): shutil.copyfileobj(response, out_file) def getTrialJson(trial_id): - trialJson = requests.get(API_URL + "trials/{}/".format(trial_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "trials/{}/".format(trial_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + trialJson = response.json() return trialJson def getSessionJson(session_id): - sessionJson = requests.get(API_URL + "sessions/{}/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + sessionJson = response.json() # sort trials by time recorded def getCreatedAt(trial): @@ -119,8 +123,10 @@ def getCreatedAt(trial): return sessionJson def getSubjectJson(subject_id): - subjectJson = requests.get(API_URL + "subjects/{}/".format(subject_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "subjects/{}/".format(subject_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + subjectJson = response.json() return subjectJson def getTrialName(trial_id): @@ -183,8 +189,10 @@ def 
postCalibrationOptions(session_path,session_id,overwrite=False): "meta":json.dumps({'calibration':calibOptionsJson}) } trial_url = "{}{}{}/".format(API_URL, "trials/", calibration_id) - r= requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code == 200: print('Wrote calibration selections to metadata.') @@ -458,8 +466,9 @@ def deleteResult(trial_id, tag=None,resultNum=None): resultNums = [r['id'] for r in trial['results']] for rNum in resultNums: - requests.delete(API_URL + "results/{}/".format(rNum), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + makeRequestWithRetry('DELETE', + API_URL + "results/{}/".format(rNum), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) def deleteAllResults(session_id): @@ -687,8 +696,10 @@ def changeSessionMetadata(session_ids,newMetaDict): data = {"meta":json.dumps(existingMeta)} - r= requests.patch(session_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + session_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code !=200: print('Changing metadata failed.') @@ -735,9 +746,11 @@ def makeSessionPublic(session_id,publicStatus=True): data = { "public":publicStatus } - - r= requests.patch(session_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + + r = makeRequestWithRetry('PATCH', + session_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if r.status_code == 200: print('Successfully made ' + session_id + ' public.') @@ -861,11 +874,17 @@ def postFileToTrial(filePath,trial_id,tag,device_id): # get S3 link data = {'fileName':os.path.split(filePath)[1]} - r = requests.get(API_URL + "sessions/null/get_presigned_url/",data=data).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/null/get_presigned_url/", + data=data) + r = response.json() # upload to S3 files = {'file': open(filePath, 'rb')} - requests.post(r['url'], data=r['fields'],files=files) + makeRequestWithRetry('POST', + r['url'], + data=r['fields'], + files=files) files["file"].close() # post link to and data to results @@ -876,8 +895,10 @@ def postFileToTrial(filePath,trial_id,tag,device_id): "media_url" : r['fields']['key'] } - rResult = requests.post(API_URL + "results/", data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + rResult = makeRequestWithRetry('POST', + API_URL + "results/", + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if rResult.status_code != 201: print('server response was + ' + str(r.status_code)) @@ -1484,8 +1505,11 @@ def checkForTrialsWithStatus(status,hours=9999999,relativeTime='newer'): 'justNumber':1, 'relativeTime':relativeTime} - r = requests.get(API_URL+"trials/get_trials_with_status/",params=params, - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL+"trials/get_trials_with_status/", + params=params, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = response.json() return r['nTrials'] @@ -1565,8 +1589,10 @@ def checkCudaTF(): # %% Some functions for loading subject data def getSubjectNumber(subjectName): - subjects = requests.get(API_URL + "subjects/", - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + 
API_URL + "subjects/", + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + subjects = response.json() sNum = [s['id'] for s in subjects if s['name'] == subjectName] if len(sNum)>1: print(len(sNum) + ' subjects with the name ' + subjectName + '. Will use the first one.') @@ -1576,8 +1602,10 @@ def getSubjectNumber(subjectName): return sNum[0] def getUserSessions(): - sessionJson = requests.get(API_URL + "sessions/valid/", - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/valid/", + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + sessionJson = response.json() return sessionJson def getSubjectSessions(subjectName): @@ -1639,8 +1667,10 @@ def postLocalClientInfo(trial_url): "git_commit": getCommitHash(), "hostname": getHostname() } - r = requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r @@ -1651,23 +1681,26 @@ def postProcessedDuration(trial_url, duration): data = { "processed_duration": duration } - r = requests.patch(trial_url, data=data, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = makeRequestWithRetry('PATCH', + trial_url, + data=data, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) return r # utils for common HTTP requests def makeRequestWithRetry(method, url, - headers=None, data=None, params=None, + headers=None, data=None, params=None, files=None, retries=5, backoff_factor=1): """ Makes an HTTP request with retry logic and returns the Response object. Args: - method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.). + method (str): HTTP method (e.g., 'GET', 'POST', 'PUT', etc.) as used in + requests.Session().request() url (str): The endpoint URL. headers (dict): Headers to include in the request. - data (dict): Data to send in the request body (for POST/PUT requests). + data (dict): Data to send in the request body. params (dict): URL query parameters. retries (int): Number of retry attempts. backoff_factor (float): Backoff factor for exponential delays. 
@@ -1680,7 +1713,7 @@ def makeRequestWithRetry(method, url, total=retries, backoff_factor=backoff_factor, status_forcelist=[429, 500, 502, 503, 504], - allowed_methods={'DELETE', 'GET', 'POST', 'PUT'} + allowed_methods={'DELETE', 'GET', 'POST', 'PUT', 'PATCH'} ) adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) @@ -1690,7 +1723,8 @@ def makeRequestWithRetry(method, url, url, headers=headers, data=data, - params=params) + params=params, + files=files) response.raise_for_status() return response diff --git a/utilsChecker.py b/utilsChecker.py index ee8a390a..41f8f931 100644 --- a/utilsChecker.py +++ b/utilsChecker.py @@ -24,6 +24,7 @@ from utilsCameraPy3 import Camera, nview_linear_triangulations from utils import getOpenPoseMarkerNames, getOpenPoseFaceMarkers from utils import numpy2TRC, rewriteVideos, delete_multiple_element,loadCameraParameters +from utils import makeRequestWithRetry from utilsAPI import getAPIURL from utilsAuth import getToken @@ -198,8 +199,9 @@ def computeAverageIntrinsics(session_path,trialIDs,CheckerBoardParams,nImages=25 camModels = [] for trial_id in trialIDs: - resp = requests.get(API_URL + "trials/{}/".format(trial_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + resp = makeRequestWithRetry('GET', + API_URL + "trials/{}/".format(trial_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) trial = resp.json() camModels.append(trial['videos'][0]['parameters']['model']) trial_name = trial['name'] diff --git a/utilsServer.py b/utilsServer.py index f97877c5..17d5da51 100644 --- a/utilsServer.py +++ b/utilsServer.py @@ -25,6 +25,7 @@ from utils import importMetadata from utils import checkAndGetPosePickles from utils import getTrialNameIdMapping +from utils import makeRequestWithRetry from utilsAuth import getToken from utilsAPI import getAPIURL @@ -67,8 +68,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Calibration failed', e.args[0], e.args[1]) if not hasWritePermissions: @@ -143,8 +146,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Static trial failed', e.args[0], e.args[1]) if not hasWritePermissions: @@ -233,8 +238,10 @@ def processTrial(session_id, trial_id, trial_type = 'dynamic', error_msg = {} error_msg['error_msg'] = e.args[0] error_msg['error_msg_dev'] = e.args[1] - _ = requests.patch(trial_url, data={"meta": json.dumps(error_msg)}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + trial_url, + data={"meta": json.dumps(error_msg)}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) raise Exception('Dynamic trial failed.\n' + error_msg['error_msg_dev'], e.args[0], e.args[1]) if not hasWritePermissions: @@ -352,8 +359,10 @@ def 
batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto print('Processing ' + session_id) # check if write permissions (session owner or admin) - permissions = requests.get(API_URL + "sessions/{}/get_session_permission/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/get_session_permission/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + permissions = response.json() hasWritePermissions = permissions['isAdmin'] or permissions['isOwner'] @@ -373,13 +382,17 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto hasWritePermissions = hasWritePermissions, cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(calib_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(calib_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(calib_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(calib_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if static_id == None: static_id_toProcess = getNeutralTrialID(session_id) @@ -400,17 +413,22 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto batchProcess = True, cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(static_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(static_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(static_id_toProcess), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(static_id_toProcess), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) if dynamic_ids == None: - - session = requests.get(API_URL + "sessions/{}/".format(session_id), - headers = {"Authorization": "Token {}".format(API_TOKEN)}).json() + response = makeRequestWithRetry('GET', + API_URL + "sessions/{}/".format(session_id), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + session = response.json() dynamic_ids_toProcess = [t['id'] for t in session['trials'] if (t['name'] != 'calibration' and t['name'] !='neutral')] else: if type(dynamic_ids) == str: @@ -433,13 +451,17 @@ def batchReprocess(session_ids,calib_id,static_id,dynamic_trialNames,poseDetecto cameras_to_use=cameras_to_use) statusData = {'status':'done'} - _ = requests.patch(API_URL + "trials/{}/".format(dID), data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(dID), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: print(e) statusData = {'status':'error'} - _ = requests.patch(API_URL + "trials/{}/".format(dID), 
data=statusData, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + _ = makeRequestWithRetry('PATCH', + API_URL + "trials/{}/".format(dID), + data=statusData, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) def runTestSession(pose='all',isDocker=True): trials = {} From 57e14f41658b5027cd65707bf543aa644a98e947 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Fri, 22 Nov 2024 15:10:58 -0800 Subject: [PATCH 07/19] do not use retries for dequeue loop in app --- app.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index 5b7d26c4..9bceb8ee 100644 --- a/app.py +++ b/app.py @@ -67,9 +67,8 @@ # no query string -> defaults to 'all' queue_path = "trials/dequeue/?workerType=" + workerType try: - r = makeRequestWithRetry('GET', - "{}{}".format(API_URL, queue_path), - headers = {"Authorization": "Token {}".format(API_TOKEN)}) + r = requests.get("{}{}".format(API_URL, queue_path), + headers = {"Authorization": "Token {}".format(API_TOKEN)}) except Exception as e: traceback.print_exc() time.sleep(15) @@ -189,4 +188,4 @@ folders = glob.glob(os.path.join(getDataDirectory(isDocker=True),'Data','*')) for f in folders: shutil.rmtree(f) - logging.info('deleting ' + f) \ No newline at end of file + logging.info('deleting ' + f) From 4bfcf1c57831c08bf82d3f87657fd71bc5369150 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 10 Dec 2024 14:27:50 -0800 Subject: [PATCH 08/19] Intercepting error of human detection pose not detected in hrnet. --- utilsDetector.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/utilsDetector.py b/utilsDetector.py index 1e57ddef..31557464 100644 --- a/utilsDetector.py +++ b/utilsDetector.py @@ -319,9 +319,13 @@ def runMMposeVideo( # copy /data/output to pathOutputPkl os.system("cp /data/output_mmpose/* {pathOutputPkl}/".format(pathOutputPkl=pathOutputPkl)) - pkl_path_tmp = os.path.join(pathOutputPkl, 'human.pkl') - os.rename(pkl_path_tmp, pklPath) - + pkl_path_tmp = os.path.join(pathOutputPkl, 'human.pkl') + if os.path.exists(pkl_path_tmp): + os.rename(pkl_path_tmp, pklPath) + else: + raise FileNotFoundError( + "We could not detect any pose in your video. Please verify that the subject is correctly in front of the camera." + ) except Exception as e: if len(e.args) == 2: # specific exception raise Exception(e.args[0], e.args[1]) From 8c1459ca3dac202311ac50654c3d7b73c0e1acc2 Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Thu, 12 Dec 2024 16:30:19 +0100 Subject: [PATCH 09/19] support for lying checkerboard --- main.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index aba3d625..678fc630 100644 --- a/main.py +++ b/main.py @@ -303,26 +303,18 @@ def main(sessionName, trialName, trial_id, cameras_to_use=['all'], trialRelativePath = os.path.join('InputMedia', trialName, trial_id) if runPoseDetection: - # Detect if checkerboard is upside down. - upsideDownChecker = isCheckerboardUpsideDown(CamParamDict) # Get rotation angles from motion capture environment to OpenSim. # Space-fixed are lowercase, Body-fixed are uppercase. checkerBoardMount = sessionMetadata['checkerBoard']['placement'] - if checkerBoardMount == 'backWall' and not upsideDownChecker: - rotationAngles = {'y':90, 'z':180} - elif checkerBoardMount == 'backWall' and upsideDownChecker: - rotationAngles = {'y':-90} - elif checkerBoardMount == 'backWall_largeCB': - rotationAngles = {'y':-90} - # TODO: uppercase? 
- elif checkerBoardMount == 'backWall_walking': - rotationAngles = {'YZ':(-90,180)} - elif checkerBoardMount == 'ground': - rotationAngles = {'x':-90, 'y':90} - elif checkerBoardMount == 'ground_jumps': # for sub1 - rotationAngles = {'x':90, 'y':180} - elif checkerBoardMount == 'ground_gaits': # for sub1 - rotationAngles = {'x':90, 'y':90} + if checkerBoardMount == 'backWall' or checkerBoardMount == 'Perpendicular': + # Detect if checkerboard is upside down. + upsideDownChecker = isCheckerboardUpsideDown(CamParamDict) + if upsideDownChecker: + rotationAngles = {'y':-90} + else: + rotationAngles = {'y':90, 'z':180} + elif checkerBoardMount == 'ground' or checkerBoardMount == 'Lying': + rotationAngles = {'x':90, 'y':90} else: raise Exception('checkerBoard placement value in\ sessionMetadata.yaml is not currently supported') From 0cb03721da28647951dbfec93b83647d2591e514 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 14 Jan 2025 16:10:12 -0800 Subject: [PATCH 10/19] Allowing running multiple instances of opencap in one server. --- docker/Makefile | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/docker/Makefile b/docker/Makefile index fda79023..9c2e71e8 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -68,12 +68,9 @@ endif .PHONY: run run: -ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH)) - aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com - -else ifeq ($(CURRENT_BRANCH),$(DEV_BRANCH)) - aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com - -endif - - OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) docker-compose up \ No newline at end of file + @if [ -z "$(INSTANCE_ID)" ]; then echo "INSTANCE_ID is required. Use make run INSTANCE_ID="; exit 1; fi + COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ + OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ + OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ + MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \ + docker-compose up From f807e2583c5f4ee2cd9414ab06b7254c28f806d7 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 21 Jan 2025 11:13:54 -0800 Subject: [PATCH 11/19] Modified Makefile and docker-compose to allow multiple instances in same machine. --- docker/Makefile | 13 +++++++-- docker/docker-compose.yaml | 9 ++++++ docker/start-script.sh | 58 ++++++++++++++++++++++++++++++++++++++ docker/stop-all-script.sh | 1 + docker/stop-script.sh | 25 ++++++++++++++++ 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100755 docker/start-script.sh create mode 100755 docker/stop-all-script.sh create mode 100755 docker/stop-script.sh diff --git a/docker/Makefile b/docker/Makefile index 9c2e71e8..826d30b8 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -68,9 +68,18 @@ endif .PHONY: run run: - @if [ -z "$(INSTANCE_ID)" ]; then echo "INSTANCE_ID is required. Use make run INSTANCE_ID="; exit 1; fi + @if [ -z "$(INSTANCE_ID)" ]; then \ + echo "INSTANCE_ID is required. Use make run INSTANCE_ID= CPU_SET="; \ + exit 1; \ + fi + @if [ -z "$(CPU_SET)" ]; then \ + echo "CPU_SET is required. 
Use make run INSTANCE_ID= CPU_SET="; \ + exit 1; \ + fi COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \ - docker-compose up + INSTANCE_ID=$(INSTANCE_ID) \ + CPU_SET=$(CPU_SET) \ + docker compose up -d diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 169ff406..a78264d8 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -8,6 +8,7 @@ services: - ../.env environment: - DOCKERCOMPOSE=1 + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -15,10 +16,13 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" openpose: image: ${OPENPOSE_IMAGE_NAME} volumes: - data:/openpose/data + environment: + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -26,10 +30,13 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" mmpose: image: ${MMPOSE_IMAGE_NAME} volumes: - data:/mmpose/data + environment: + - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: @@ -37,5 +44,7 @@ services: - driver: nvidia count: 1 capabilities: [gpu] + cpuset: "${CPU_SET}" + volumes: data: {} diff --git a/docker/start-script.sh b/docker/start-script.sh new file mode 100755 index 00000000..c677a007 --- /dev/null +++ b/docker/start-script.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Configuration +MAX_INSTANCES=8 +CPUS_PER_INSTANCE=14 +GPUS_PER_INSTANCE=1 + +# Get the total number of CPUs and GPUs available +TOTAL_CPUS=$(nproc) +TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +# Read number of instances to start +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Provide the number of instances to start (max $MAX_INSTANCES)." + exit 1 +fi + +NUM_INSTANCES=$1 + +# Validate the number of instances +if (( NUM_INSTANCES > MAX_INSTANCES )); then + echo "Error: Maximum number of instances is $MAX_INSTANCES." + exit 1 +fi + +# Check if there are enough resources +if (( NUM_INSTANCES * CPUS_PER_INSTANCE > TOTAL_CPUS )); then + echo "Error: Not enough CPUs. Required: $((NUM_INSTANCES * CPUS_PER_INSTANCE)), Available: $TOTAL_CPUS." + exit 1 +fi + +if (( NUM_INSTANCES * GPUS_PER_INSTANCE > TOTAL_GPUS )); then + echo "Error: Not enough GPUs. Required: $((NUM_INSTANCES * GPUS_PER_INSTANCE)), Available: $TOTAL_GPUS." + exit 1 +fi + +# Display summary +echo "Starting $NUM_INSTANCES instances..." +echo "Total CPUs: $TOTAL_CPUS (using $CPUS_PER_INSTANCE per instance)" +echo "Total GPUs: $TOTAL_GPUS (using $GPUS_PER_INSTANCE per instance)" +echo + +# Start instances +for (( i=0; i" + exit 1 +fi + +INSTANCE_ID=$1 +COMPOSE_PROJECT_NAME="opencap_${INSTANCE_ID}" + +echo "Stopping and removing containers for INSTANCE_ID=${INSTANCE_ID}..." + +# Stop and remove containers associated with the project +docker-compose \ + --project-name $COMPOSE_PROJECT_NAME \ + down + +# Verify if containers are removed +if [ $? -eq 0 ]; then + echo "Successfully stopped and removed containers for INSTANCE_ID=${INSTANCE_ID}." +else + echo "Failed to stop and remove containers for INSTANCE_ID=${INSTANCE_ID}." +fi + From 98a385db50456a2ed6cd058f42c3dc056b84dbcf Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Thu, 23 Jan 2025 12:27:25 -0800 Subject: [PATCH 12/19] Renamed scripts. Added script to stop all. 
--- docker/{start-script.sh => start-containers.sh} | 4 +++- docker/stop-all-containers.sh | 1 + docker/stop-all-script.sh | 1 - docker/{stop-script.sh => stop-container.sh} | 0 4 files changed, 4 insertions(+), 2 deletions(-) rename docker/{start-script.sh => start-containers.sh} (99%) create mode 100755 docker/stop-all-containers.sh delete mode 100755 docker/stop-all-script.sh rename docker/{stop-script.sh => stop-container.sh} (100%) diff --git a/docker/start-script.sh b/docker/start-containers.sh similarity index 99% rename from docker/start-script.sh rename to docker/start-containers.sh index c677a007..6b53a404 100755 --- a/docker/start-script.sh +++ b/docker/start-containers.sh @@ -49,9 +49,11 @@ for (( i=0; i Date: Tue, 28 Jan 2025 11:12:01 -0800 Subject: [PATCH 13/19] Added script to check container status and start single containers. --- docker/check-containers-health.sh | 34 +++++++++++++++++++++ docker/start-container.sh | 51 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100755 docker/check-containers-health.sh create mode 100755 docker/start-container.sh diff --git a/docker/check-containers-health.sh b/docker/check-containers-health.sh new file mode 100755 index 00000000..e87cfc0c --- /dev/null +++ b/docker/check-containers-health.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Function to check if a container is running +is_container_alive() { + local container_name=$1 + docker ps --filter "name=^/${container_name}$" --filter "status=running" --format '{{.Names}}' | grep -wq "$container_name" + return $? +} + +# Loop through numbers 0 to 7 +for n in {0..7}; do + # Container names + opencap_openpose="opencap_${n}-openpose-1" + opencap_mmpose="opencap_${n}-mmpose-1" + opencap_mobilecap="opencap_${n}-mobilecap-1" + + # Check if all three containers are alive + if is_container_alive "$opencap_openpose" && \ + is_container_alive "$opencap_mmpose" && \ + is_container_alive "$opencap_mobilecap"; then + echo "All containers for instance $n are alive. Skipping." + continue + fi + + # Check if any container exists + if docker ps -a --filter "name=^/opencap_${n}-(openpose|mmpose|mobilecap)-1$" --format '{{.Names}}' | grep -q "opencap_${n}"; then + echo "Some containers for instance $n are not alive. Stopping instance." + ./stop-container.sh "$n" + ./start-container.sh "$n" + else + echo "No containers for instance $n. Skipping." + fi + +done diff --git a/docker/start-container.sh b/docker/start-container.sh new file mode 100755 index 00000000..18a3dd36 --- /dev/null +++ b/docker/start-container.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Configuration +MAX_INSTANCES=8 +CPUS_PER_INSTANCE=14 +GPUS_PER_INSTANCE=1 + +# Get the total number of CPUs and GPUs available +TOTAL_CPUS=$(nproc) +TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +# Check if an instance number is provided +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Provide the instance number to start (0 to $((MAX_INSTANCES - 1)))." + exit 1 +fi + +INSTANCE_NUMBER=$1 + +# Validate the instance number +if (( INSTANCE_NUMBER < 0 || INSTANCE_NUMBER >= MAX_INSTANCES )); then + echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))." 
+ exit 1 +fi + +# Compute CPU and GPU offsets for the selected instance +CPU_START=$(( INSTANCE_NUMBER * CPUS_PER_INSTANCE )) +CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 )) +CPU_SET="${CPU_START}-${CPU_END}" + +# Validate resource availability +if (( CPU_START + CPUS_PER_INSTANCE > TOTAL_CPUS )); then + echo "Error: Not enough CPUs available for instance $INSTANCE_NUMBER." + exit 1 +fi + +if (( INSTANCE_NUMBER >= TOTAL_GPUS )); then + echo "Error: Not enough GPUs available for instance $INSTANCE_NUMBER." + exit 1 +fi + +# Start the specific instance +echo "Starting instance $INSTANCE_NUMBER with CPU_SET=${CPU_SET} and GPU=${INSTANCE_NUMBER}" + +# Run docker-compose for the specific instance +make run INSTANCE_ID=$INSTANCE_NUMBER CPU_SET=$CPU_SET + +sleep 10 + +echo "Instance $INSTANCE_NUMBER started successfully." From 666e1654bca971b90709cf36c5d82470690d4d7c Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Tue, 28 Jan 2025 15:52:07 -0800 Subject: [PATCH 14/19] Added ugly logging to docker containers. Added try catch to finally in app.py --- app.py | 7 +++++-- docker/docker-compose.yaml | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 9bceb8ee..42dfd813 100644 --- a/app.py +++ b/app.py @@ -178,8 +178,11 @@ finally: # End process duration timer and post duration to database - process_end_time = datetime.now() - postProcessedDuration(trial_url, process_end_time - process_start_time) + try: + process_end_time = datetime.now() + postProcessedDuration(trial_url, process_end_time - process_start_time) + except Exception as e: + traceback.print_exc() justProcessed = True diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index a78264d8..4b48f969 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -17,6 +17,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files openpose: image: ${OPENPOSE_IMAGE_NAME} volumes: @@ -31,6 +36,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files mmpose: image: ${MMPOSE_IMAGE_NAME} volumes: @@ -45,6 +55,11 @@ services: count: 1 capabilities: [gpu] cpuset: "${CPU_SET}" + logging: + driver: "json-file" + options: + max-size: "100m" # Rotate when the log reaches 10MB + max-file: "7" # Keep the last 7 log files volumes: data: {} From f42a6026fb2597ec59b013c3d5c94ca8194f1f1c Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Thu, 30 Jan 2025 14:48:11 -0800 Subject: [PATCH 15/19] catch exception if patch fails in except block --- app.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index 42dfd813..2226cceb 100644 --- a/app.py +++ b/app.py @@ -160,10 +160,13 @@ time.sleep(0.5) except Exception as e: - r = makeRequestWithRetry('PATCH', - trial_url, data={"status": "error"}, - headers = {"Authorization": "Token {}".format(API_TOKEN)}) - traceback.print_exc() + try: + r = makeRequestWithRetry('PATCH', + trial_url, data={"status": "error"}, + headers = {"Authorization": "Token {}".format(API_TOKEN)}) + traceback.print_exc() + except: + traceback.print_exc() # Antoine: Removing this, it is too often causing the machines to stop. 
Not because # the machines are failing, but because for instance the video is very long with a lot From b14f5bc38b0a77386bf8dbf58fb187e661f47fa4 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Mon, 3 Feb 2025 18:08:14 -0800 Subject: [PATCH 16/19] use different gpu for each instance --- docker/docker-compose.yaml | 11 +++-------- docker/stop-all-containers.sh | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 4b48f969..55acdbdc 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -8,14 +8,13 @@ services: - ../.env environment: - DOCKERCOMPOSE=1 - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" @@ -26,15 +25,13 @@ services: image: ${OPENPOSE_IMAGE_NAME} volumes: - data:/openpose/data - environment: - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" @@ -45,15 +42,13 @@ services: image: ${MMPOSE_IMAGE_NAME} volumes: - data:/mmpose/data - environment: - - NVIDIA_VISIBLE_DEVICES=${INSTANCE_ID} deploy: resources: reservations: devices: - driver: nvidia - count: 1 capabilities: [gpu] + device_ids: ["${INSTANCE_ID}"] cpuset: "${CPU_SET}" logging: driver: "json-file" diff --git a/docker/stop-all-containers.sh b/docker/stop-all-containers.sh index eab78ea0..21c4441b 100755 --- a/docker/stop-all-containers.sh +++ b/docker/stop-all-containers.sh @@ -1 +1 @@ -for i in $(seq 0 8); do ./stop-container.sh $i; done +for i in $(seq 0 7); do ./stop-container.sh $i; done From 638807e0e3e5f206c9611d2ba5ddaeddbd05fc21 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Tue, 4 Feb 2025 17:25:05 -0800 Subject: [PATCH 17/19] add error logging json file (optional) for machines running app.py --- app.py | 24 ++++++++++++++++++++++-- utils.py | 30 ++++++++++++++++++++++++++++++ utilsAPI.py | 3 +++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 2226cceb..a1ce13a9 100644 --- a/app.py +++ b/app.py @@ -9,12 +9,13 @@ import glob from datetime import datetime, timedelta import numpy as np -from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials +from utilsAPI import getAPIURL, getWorkerType, getErrorLogBool, getASInstance, unprotect_current_instance, get_number_of_pending_trials from utilsAuth import getToken from utils import (getDataDirectory, checkTime, checkResourceUsage, sendStatusEmail, checkForTrialsWithStatus, getCommitHash, getHostname, postLocalClientInfo, - postProcessedDuration, makeRequestWithRetry) + postProcessedDuration, makeRequestWithRetry, + writeToErrorLog) logging.basicConfig(level=logging.INFO) @@ -24,6 +25,9 @@ autoScalingInstance = getASInstance() logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}") +ERROR_LOG = getErrorLogBool() +error_log_path = "/data/error_log.json" + # if true, will delete entire data directory when finished with a trial isDocker = True @@ -165,9 +169,20 @@ trial_url, data={"status": "error"}, headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() + + if ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + except: traceback.print_exc() + if 
ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + # Antoine: Removing this, it is too often causing the machines to stop. Not because # the machines are failing, but because for instance the video is very long with a lot # of people in it. We should not stop the machine for that. Originally the check was @@ -187,6 +202,11 @@ except Exception as e: traceback.print_exc() + if ERROR_LOG: + stack = traceback.format_exc() + writeToErrorLog(error_log_path, trial["session"], trial["id"], + e, stack) + justProcessed = True # Clean data directory diff --git a/utils.py b/utils.py index 2b140eed..45bd613d 100644 --- a/utils.py +++ b/utils.py @@ -12,6 +12,7 @@ import subprocess import zipfile import time +import datetime import numpy as np import pandas as pd @@ -1585,6 +1586,35 @@ def checkCudaTF(): sendStatusEmail(message=message) raise Exception("No GPU detected. Exiting.") +def writeToJsonLog(path, new_dict, max_entries=1000): + dir_name = os.path.dirname(path) + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + if os.path.exists(path): + with open(path, 'r') as f: + data = json.load(f) + else: + data = [] + + data.append(new_dict) + + while len(data) > max_entries: + data.pop(0) + + with open(path, 'w') as f: + json.dump(data, f) + +def writeToErrorLog(path, session_id, trial_id, error, stack, max_entries=1000): + error_entry = { + 'session_id': session_id, + 'trial_id': trial_id, + 'datetime': datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + 'error': str(error), + 'stack': stack + } + writeToJsonLog(path, error_entry, max_entries) + # %% Some functions for loading subject data def getSubjectNumber(subjectName): diff --git a/utilsAPI.py b/utilsAPI.py index 982bf4f7..1535406c 100644 --- a/utilsAPI.py +++ b/utilsAPI.py @@ -48,6 +48,9 @@ def getStatusEmails(): return emailInfo +def getErrorLogBool(): + return config('ERROR_LOG', default=False, cast=bool) + def getASInstance(): try: # Check if the ECS_CONTAINER_METADATA_FILE environment variable exists From 7d7e215f045138d9b7706cbaf35e2e9c0e9f24e0 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Wed, 5 Feb 2025 15:56:36 -0800 Subject: [PATCH 18/19] set defaults for INSTANCE_ID and CPU_SET to restore make run usage --- docker/Makefile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docker/Makefile b/docker/Makefile index 826d30b8..ff2197b5 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -3,6 +3,10 @@ REPO_NAME := opencap PROD_BRANCH := main DEV_BRANCH := dev +# Initialize variables if not passed in +INSTANCE_ID ?= 0 +CPU_SET ?= "" + # Determine the branch name CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) @@ -68,14 +72,9 @@ endif .PHONY: run run: - @if [ -z "$(INSTANCE_ID)" ]; then \ - echo "INSTANCE_ID is required. Use make run INSTANCE_ID= CPU_SET="; \ - exit 1; \ - fi - @if [ -z "$(CPU_SET)" ]; then \ - echo "CPU_SET is required. 
Use make run INSTANCE_ID= CPU_SET="; \ - exit 1; \ - fi + @echo "Usage: sudo make run INSTANCE_ID= CPU_SET=" + @echo "Defaults: INSTANCE_ID=0, CPU_SET=\"\"" + COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \ OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \ OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \ From 116e8d4702876327005be026e6c9206e261ff617 Mon Sep 17 00:00:00 2001 From: carmichaelong Date: Thu, 6 Feb 2025 16:29:18 -0800 Subject: [PATCH 19/19] retry test session with random waiting time, simplify makeRequestWithRetry error handling --- docker/start-containers.sh | 2 +- utils.py | 44 ++++++++----------- utilsServer.py | 90 +++++++++++++++++++++++++------------- 3 files changed, 80 insertions(+), 56 deletions(-) diff --git a/docker/start-containers.sh b/docker/start-containers.sh index 6b53a404..fb281e39 100755 --- a/docker/start-containers.sh +++ b/docker/start-containers.sh @@ -53,7 +53,7 @@ for (( i=0; i
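
A minimal usage sketch of the makeRequestWithRetry wrapper that this series introduces in utils.py. The function name, signature, and retry behavior come from the patches above; the base URL, token, and trial id below are hypothetical placeholders, not values from the OpenCap API.

from utils import makeRequestWithRetry

API_URL = "https://api.example.com/"   # placeholder, not the real API URL
API_TOKEN = "my_token"                 # placeholder credential

# GET: responses with status 429/500/502/503/504 are retried with
# exponential backoff (per the urllib3 Retry strategy configured in
# utils.py); a final HTTP error is re-raised as a plain Exception.
response = makeRequestWithRetry(
    'GET',
    API_URL + "sessions/valid/",
    headers={"Authorization": "Token {}".format(API_TOKEN)},
    retries=3,
    backoff_factor=0.5,
)
sessions = response.json()

# PATCH with a form body. PATCH is also retried, since the
# "replace requests calls with retry wrapper" commit adds it to
# allowed_methods.
r = makeRequestWithRetry(
    'PATCH',
    API_URL + "trials/some-trial-id/",   # placeholder trial id
    data={"status": "done"},
    headers={"Authorization": "Token {}".format(API_TOKEN)},
)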
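
PATCH 19/19 is cut off above before its utils.py and utilsServer.py hunks. As a rough, generic sketch of the idea named in its subject line (retrying the test session after a random waiting time), and explicitly not a reconstruction of the committed change; the helper name and parameters below are invented for illustration:

import random
import time

def runWithRandomRetry(func, attempts=3, min_wait_s=5, max_wait_s=30):
    """Call func(); on failure, sleep a random interval and try again.

    Randomizing the wait keeps parallel workers from retrying in
    lockstep. Hypothetical helper, not code from the patch.
    """
    for attempt in range(attempts):
        try:
            return func()
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(random.uniform(min_wait_s, max_wait_s))

# e.g., wrapping the test-session entry point defined in utilsServer.py:
# runWithRandomRetry(lambda: runTestSession(pose='all', isDocker=True))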