From dd5abd7a31d4fe4cb8602d0430291129210f09f7 Mon Sep 17 00:00:00 2001 From: kravetsmic Date: Mon, 22 May 2023 13:16:17 +0300 Subject: [PATCH 1/4] feat(api): refinement of mixed file requests --- .../api/test_file_apis.py | 106 ++++++++++++++++-- .../api/test_file_text_apis.py | 81 +++++++++---- .../api/test_text_apis.py | 42 +++++++ .../api/process_file_1.py | 6 + .../api/process_file_2.py | 8 +- .../api/process_file_3.py | 8 +- .../api/process_file_4.py | 6 + .../api/process_file_5.py | 6 + .../api/process_text_1.py | 8 +- .../api/process_text_2.py | 8 +- .../api/process_text_3.py | 8 +- .../api/process_text_4.py | 8 +- .../api/process_text_file_1.py | 14 +++ .../api/process_text_file_2.py | 14 +++ .../api/process_text_file_3.py | 14 +++ .../api/process_text_file_4.py | 14 +++ .../pipelines/templates/pipeline_api.txt | 35 ++++-- 17 files changed, 344 insertions(+), 42 deletions(-) diff --git a/test_unstructured_api_tools/api/test_file_apis.py b/test_unstructured_api_tools/api/test_file_apis.py index a97bdbf..3e0ea8b 100644 --- a/test_unstructured_api_tools/api/test_file_apis.py +++ b/test_unstructured_api_tools/api/test_file_apis.py @@ -31,6 +31,8 @@ GZIP_FILE_IMAGE, GZIP_FILE_DOCX, FILE_MARKDOWN, + FILE_TXT_1, + GZIP_FILE_TXT_1, ) # accepts: files, input2 @@ -224,6 +226,9 @@ def _json_for_one_file(test_file): ([], P_INPUT_1_EMPTY, JSON, 400, None), ([GZIP_FILE_DOCX], P_INPUT_1_EMPTY, JSON, 200, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_DOCX], P_INPUT_1_EMPTY, JSON, 200, FILENAME_FORMATS[FILE_IMAGE]), + ([FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), + ([FILE_DOCX, FILE_IMAGE, FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), + ([FILE_DOCX, GZIP_FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), ], ) def test_process_file_1( @@ -235,6 +240,8 @@ def test_process_file_1( data = test_params if gz_content_type: data["gz_uncompressed_content_type"] = gz_content_type + else: + data["gz_uncompressed_content_type"] = None response = client.post( endpoint, @@ -270,13 +277,15 
@@ def test_process_file_1( ([FILE_DOCX, GZIP_FILE_IMAGE], MIXED, 200, None, False, None), ([GZIP_FILE_DOCX, GZIP_FILE_IMAGE], MIXED, 200, None, False, None), ([GZIP_FILE_DOCX, GZIP_FILE_IMAGE], TEXT_CSV, 406, None, False, None), - ([FILE_MARKDOWN, GZIP_FILE_IMAGE], JSON, 200, None, False, None), - ([FILE_MARKDOWN], JSON, 200, None, False, None), - ([FILE_MARKDOWN], JSON, 200, None, True, None), + ([FILE_MARKDOWN, GZIP_FILE_IMAGE], JSON, 400, None, False, None), + ([FILE_MARKDOWN], JSON, 400, None, False, None), + ([FILE_MARKDOWN], JSON, 400, None, True, None), ([FILE_MSG], JSON, 200, None, True, None), ([FILE_JSON], JSON, 200, None, True, None), ([GZIP_FILE_DOCX], JSON, 200, None, False, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_DOCX], JSON, 200, None, False, FILENAME_FORMATS[FILE_IMAGE]), + ([GZIP_FILE_DOCX, GZIP_FILE_TXT_1], JSON, 400, None, False, None), + ([FILE_TXT_1], JSON, 400, None, False, None), ], ) def test_process_file_2( @@ -421,12 +430,12 @@ def test_process_file_2( None, None, ), - ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, True, None, None), + ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 400, True, None, None), ( [FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, - 200, + 400, False, FILENAME_FORMATS[FILE_MARKDOWN], None, @@ -459,6 +468,33 @@ def test_process_file_2( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + 400, + False, + None, + None, + ), + ( + [FILE_DOCX, FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + 400, + False, + None, + None, + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + 400, + False, + None, + None, + ), ], ) def test_process_file_3( @@ -630,7 +666,7 @@ def test_process_file_3( False, None, ), - ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 200, None, True, None), + ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 400, None, True, None), ( [GZIP_FILE_DOCX, GZIP_FILE_IMAGE], MIXED, @@ -663,6 +699,36 @@ 
def test_process_file_3( False, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), + ( + [FILE_DOCX, FILE_MARKDOWN], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), ], ) def test_process_file_4( @@ -871,7 +937,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, False, None, None, @@ -882,7 +948,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, True, None, None, @@ -893,7 +959,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, False, FILENAME_FORMATS[FILE_MARKDOWN], None, @@ -975,6 +1041,28 @@ def test_process_file_4( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_DOCX, FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + None + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + None, + ), ], ) def test_process_file_5( diff --git a/test_unstructured_api_tools/api/test_file_text_apis.py b/test_unstructured_api_tools/api/test_file_text_apis.py index 2c8eda8..0b760ba 100644 --- a/test_unstructured_api_tools/api/test_file_text_apis.py +++ b/test_unstructured_api_tools/api/test_file_text_apis.py @@ -311,18 +311,18 @@ def _json_for_one_file(test_file=None, test_text_file=None): ([FILE_DOCX, FILE_IMAGE], [GZIP_FILE_TXT_1, GZIP_FILE_TXT_2], 200, JSON, False, None, None), ([FILE_DOCX], [GZIP_FILE_TXT_2], 200, JSON, False, None, None), ([GZIP_FILE_IMAGE], [GZIP_FILE_TXT_1], 200, JSON, False, None, None), - ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 200, JSON, True, None, None), + ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 400, JSON, True, 
None, None), ( [FILE_MARKDOWN], [GZIP_FILE_TXT_1], - 200, + 400, JSON, False, f"{FILENAME_FORMATS[FILE_MARKDOWN]},{FILENAME_FORMATS[FILE_TXT_1]}", None, ), ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 400, JSON, False, FILENAME_FORMATS[FILE_TXT_1], None), - ([FILE_MARKDOWN, FILE_DOCX], [GZIP_FILE_TXT_1, FILE_TXT_2], 200, MIXED, False, None, None), + ([FILE_DOCX], [GZIP_FILE_TXT_1, FILE_TXT_2, FILE_MARKDOWN], 200, MIXED, False, None, None), ([], [], 400, JSON, False, None, None), ( [FILE_MARKDOWN, FILE_DOCX], @@ -337,6 +337,9 @@ def _json_for_one_file(test_file=None, test_text_file=None): ([FILE_DOCX], [], 200, JSON, False, None, None), ([GZIP_FILE_DOCX], [FILE_TXT_1], 200, JSON, False, None, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_IMAGE], [], 200, JSON, False, None, FILENAME_FORMATS[FILE_IMAGE]), + ([FILE_TXT_1], [], 400, JSON, False, None, None), + ([], [FILE_DOCX], 400, JSON, False, None, None), + ([FILE_DOCX, FILE_IMAGE, FILE_MARKDOWN], [FILE_TXT_1], 400, JSON, False, None, None) ], ) def test_process_file_text_1( @@ -431,10 +434,10 @@ def test_process_file_text_1( ([GZIP_FILE_IMAGE], [GZIP_FILE_TXT_1], JSON, P_INPUT_2_MULTI, 200, False, None, None), ([], [FILE_TXT_1], TEXT_CSV, P_INPUT_2_EMPTY, 406, False, None, None), ([], [FILE_TXT_1], JSON, P_INPUT_2_EMPTY, 200, False, None, None), - ([FILE_MARKDOWN], [FILE_TXT_1], JSON, P_INPUT_2_EMPTY, 200, True, None, None), + ([], [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_EMPTY, 200, True, None, None), ( - [FILE_MARKDOWN], - [FILE_TXT_1], + [], + [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_MULTI, 200, @@ -443,8 +446,8 @@ def test_process_file_text_1( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_1], + [], + [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_SINGLE, 400, @@ -453,7 +456,7 @@ def test_process_file_text_1( None, ), ([], [], JSON, P_INPUT_2_EMPTY, 400, False, None, None), - ([FILE_MARKDOWN], [FILE_TXT_1], TEXT_CSV, P_INPUT_2_MULTI, 406, False, None, None), + ([], [FILE_TXT_1, FILE_MARKDOWN], TEXT_CSV, P_INPUT_2_MULTI, 
406, False, None, None), ([], [FILE_TXT_1], JSON, P_INPUT_2_SINGLE, 200, False, None, None), ([FILE_DOCX], [], JSON, P_INPUT_2_SINGLE, 200, False, None, None), ([], [FILE_TXT_1], MIXED, P_INPUT_2_EMPTY, 200, False, None, None), @@ -477,6 +480,46 @@ def test_process_file_text_1( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_DOCX, GZIP_FILE_TXT_1], + [FILE_TXT_2], + JSON, + P_INPUT_2_SINGLE, + 400, + False, + None, + None + ), + ( + [GZIP_FILE_DOCX], + [GZIP_FILE_IMAGE], + JSON, + P_INPUT_2_MULTI, + 400, + False, + None, + None, + ), + ( + [], + [FILE_MARKDOWN], + JSON, + P_INPUT_2_EMPTY, + 200, + True, + None, + None, + ), + ( + [], + [], + JSON, + P_INPUT_1_EMPTY, + 400, + False, + None, + None, + ), ], ) def test_process_file_text_2( @@ -645,8 +688,8 @@ def test_process_file_text_2( ([], [GZIP_FILE_TXT_1], TEXT_CSV, RESPONSE_SCHEMA_LABELSTUDIO, 406, False, None, None), ([], [GZIP_FILE_TXT_1], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None), ( - [FILE_DOCX, FILE_MARKDOWN], - [GZIP_FILE_TXT_1], + [FILE_DOCX], + [GZIP_FILE_TXT_1, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, 200, @@ -665,8 +708,8 @@ def test_process_file_text_2( None, ), ( - [FILE_DOCX, FILE_MARKDOWN], - [GZIP_FILE_TXT_1], + [FILE_DOCX], + [GZIP_FILE_TXT_1, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, 400, @@ -985,8 +1028,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, @@ -997,8 +1040,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, @@ -1009,8 +1052,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, diff --git a/test_unstructured_api_tools/api/test_text_apis.py b/test_unstructured_api_tools/api/test_text_apis.py index cf64307..685d1e3 100644 --- 
a/test_unstructured_api_tools/api/test_text_apis.py +++ b/test_unstructured_api_tools/api/test_text_apis.py @@ -27,6 +27,8 @@ GZIP_FILE_TXT_2, FILE_MARKDOWN, FILENAME_FORMATS, + FILE_DOCX, + GZIP_FILE_DOCX, ) # accepts: text files @@ -164,6 +166,9 @@ def _json_for_one_file(test_file): ([], 400, False, None, JSON, None), ([GZIP_FILE_TXT_1], 200, False, None, JSON, None), ([GZIP_FILE_TXT_1], 200, False, None, JSON, FILENAME_FORMATS[FILE_TXT_1]), + ([FILE_DOCX], 400, False, None, JSON, None), + ([GZIP_FILE_DOCX], 400, False, None, JSON, None), + ([FILE_TXT_1, FILE_DOCX], 400, False, None, JSON, None), ], ) def test_process_text_1( @@ -344,6 +349,36 @@ def test_process_text_1( JSON, FILENAME_FORMATS[FILE_TXT_1], ), + ( + [FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ), + ( + [GZIP_FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ), + ( + [FILE_TXT_1, GZIP_FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ) ], ) def test_process_text_2( @@ -434,6 +469,9 @@ def test_process_text_2( ([FILE_TXT_2, FILE_MARKDOWN], None, 406, False, FILENAME_FORMATS[FILE_TXT_1], None), ([], None, 400, False, None, None), ([GZIP_FILE_TXT_1], JSON, 200, False, None, FILENAME_FORMATS[FILE_TXT_1]), + ([FILE_DOCX], JSON, 400, False, None, None), + ([FILE_TXT_1, GZIP_FILE_DOCX], JSON, 400, False, None, None), + ([FILE_TXT_1, FILE_TXT_1, FILE_DOCX], JSON, 400, False, None, None), ], ) def test_process_text_3( @@ -522,6 +560,10 @@ def test_process_text_3( None, FILENAME_FORMATS[FILE_TXT_1], ), + ([FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), + ([FILE_TXT_1, FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), + ([GZIP_FILE_DOCX], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 400, False, None, None), + ([FILE_TXT_1, GZIP_FILE_TXT_1, GZIP_FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None) ], ) def test_process_text_4( diff --git 
a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py index b982ef6..095e373 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py @@ -174,6 +174,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py index 61475e3..e3dee1b 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py @@ -159,7 +159,13 @@ def pipeline_1( def response_generator(is_multipart): for file in files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) _file = file.file diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py index 7408d07..dd18461 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py +++ 
b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py @@ -182,7 +182,13 @@ def pipeline_1( def response_generator(is_multipart): for file in files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) _file = file.file diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py index 0fdb752..151834f 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py @@ -197,6 +197,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py index a02d4bf..b6d9bb3 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py @@ -200,6 +200,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type 
{file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py index 728ef5f..f33f7fb 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py @@ -160,7 +160,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py index 8b04ab0..1deab0a 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py @@ -161,7 +161,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git 
a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py index 2897630..4ebb2e7 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py @@ -175,7 +175,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py index 5cea38a..4cb3bc3 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py @@ -184,7 +184,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py index 5ff1efa..4c3ff53 100644 --- 
a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py @@ -200,6 +200,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -217,6 +225,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py index a5785de..3b1c0c6 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py @@ -221,6 +221,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -255,6 +263,12 @@ def 
response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py index ca5caea..1f9691a 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py @@ -223,6 +223,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -257,6 +265,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py index 2b5ff7e..ef15d83 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py +++ 
b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py @@ -229,6 +229,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -265,6 +273,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/unstructured_api_tools/pipelines/templates/pipeline_api.txt b/unstructured_api_tools/pipelines/templates/pipeline_api.txt index fc921fd..4d755af 100644 --- a/unstructured_api_tools/pipelines/templates/pipeline_api.txt +++ b/unstructured_api_tools/pipelines/templates/pipeline_api.txt @@ -201,6 +201,14 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), ) def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -240,11 +248,13 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), for file in files_list: _file = file.file - {% if "file_content_type" in optional_param_value_map %} file_content_type = get_validated_mimetype(file) - {% else %} - 
get_validated_mimetype(file) - {% endif %} + + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) response = pipeline_api( text=None, @@ -306,15 +316,24 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), def response_generator(is_multipart): for file in {{var_name}}: - {% if "file_content_type" in optional_param_value_map %} file_content_type = get_validated_mimetype(file) - {% else %} - get_validated_mimetype(file) - {% endif %} + {% if accepts_text %} + if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + text = file.file.read().decode("utf-8") {% elif accepts_file %} + if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + _file = file.file {% endif %} From 7f298e082add5e3fd9c58663011000f877788a12 Mon Sep 17 00:00:00 2001 From: kravetsmic Date: Mon, 22 May 2023 13:19:59 +0300 Subject: [PATCH 2/4] chore: bump version --- CHANGELOG.md | 4 ++++ unstructured_api_tools/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 854a602..9fff13b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.10.6 + +* Refinement around handling of mixed text file / non-text file requests + # 0.10.5 * Add optional CORS to api diff --git a/unstructured_api_tools/__version__.py b/unstructured_api_tools/__version__.py index 3e778da..b3f2482 100644 --- a/unstructured_api_tools/__version__.py +++ b/unstructured_api_tools/__version__.py @@ -1 +1 @@ -__version__ = "0.10.5" # pragma: no cover +__version__ = "0.10.6" # pragma: no 
cover From 6776bad360b362d275e79d237cffa0a5c51c34d1 Mon Sep 17 00:00:00 2001 From: kravetsmic Date: Mon, 22 May 2023 13:29:19 +0300 Subject: [PATCH 3/4] chore(tests): format code in tests --- test_unstructured_api_tools/api/test_file_apis.py | 6 +++--- .../api/test_file_text_apis.py | 6 +++--- test_unstructured_api_tools/api/test_text_apis.py | 12 ++++++++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/test_unstructured_api_tools/api/test_file_apis.py b/test_unstructured_api_tools/api/test_file_apis.py index 3e0ea8b..782f8b1 100644 --- a/test_unstructured_api_tools/api/test_file_apis.py +++ b/test_unstructured_api_tools/api/test_file_apis.py @@ -492,7 +492,7 @@ def test_process_file_2( RESPONSE_SCHEMA_LABELSTUDIO, 400, False, - None, + None, None, ), ], @@ -707,7 +707,7 @@ def test_process_file_3( 400, None, False, - None, + None, ), ( [FILE_TXT_1], @@ -1050,7 +1050,7 @@ def test_process_file_4( 400, False, None, - None + None, ), ( [FILE_TXT_1], diff --git a/test_unstructured_api_tools/api/test_file_text_apis.py b/test_unstructured_api_tools/api/test_file_text_apis.py index 0b760ba..566087a 100644 --- a/test_unstructured_api_tools/api/test_file_text_apis.py +++ b/test_unstructured_api_tools/api/test_file_text_apis.py @@ -339,7 +339,7 @@ def _json_for_one_file(test_file=None, test_text_file=None): ([GZIP_FILE_IMAGE], [], 200, JSON, False, None, FILENAME_FORMATS[FILE_IMAGE]), ([FILE_TXT_1], [], 400, JSON, False, None, None), ([], [FILE_DOCX], 400, JSON, False, None, None), - ([FILE_DOCX, FILE_IMAGE, FILE_MARKDOWN], [FILE_TXT_1], 400, JSON, False, None, None) + ([FILE_DOCX, FILE_IMAGE, FILE_MARKDOWN], [FILE_TXT_1], 400, JSON, False, None, None), ], ) def test_process_file_text_1( @@ -488,7 +488,7 @@ def test_process_file_text_1( 400, False, None, - None + None, ), ( [GZIP_FILE_DOCX], @@ -507,7 +507,7 @@ def test_process_file_text_1( P_INPUT_2_EMPTY, 200, True, - None, + None, None, ), ( diff --git 
a/test_unstructured_api_tools/api/test_text_apis.py b/test_unstructured_api_tools/api/test_text_apis.py index 685d1e3..9a0c6fa 100644 --- a/test_unstructured_api_tools/api/test_text_apis.py +++ b/test_unstructured_api_tools/api/test_text_apis.py @@ -378,7 +378,7 @@ def test_process_text_1( None, JSON, None, - ) + ), ], ) def test_process_text_2( @@ -563,7 +563,15 @@ def test_process_text_3( ([FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), ([FILE_TXT_1, FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), ([GZIP_FILE_DOCX], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 400, False, None, None), - ([FILE_TXT_1, GZIP_FILE_TXT_1, GZIP_FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None) + ( + [FILE_TXT_1, GZIP_FILE_TXT_1, GZIP_FILE_DOCX], + JSON, + RESPONSE_SCHEMA_ISD, + 400, + False, + None, + None, + ), ], ) def test_process_text_4( From f0c476e67693540790546387dc93a01f4d6ba8ec Mon Sep 17 00:00:00 2001 From: kravetsmic Date: Tue, 23 May 2023 12:12:17 +0300 Subject: [PATCH 4/4] refactor(api): removed enumeration mimetypes --- .../prepline_test_project/api/process_file_1.py | 2 +- .../prepline_test_project/api/process_file_2.py | 2 +- .../prepline_test_project/api/process_file_3.py | 2 +- .../prepline_test_project/api/process_file_4.py | 2 +- .../prepline_test_project/api/process_file_5.py | 2 +- .../prepline_test_project/api/process_text_1.py | 2 +- .../prepline_test_project/api/process_text_2.py | 2 +- .../prepline_test_project/api/process_text_3.py | 2 +- .../prepline_test_project/api/process_text_4.py | 2 +- .../prepline_test_project/api/process_text_file_1.py | 4 ++-- .../prepline_test_project/api/process_text_file_2.py | 4 ++-- .../prepline_test_project/api/process_text_file_3.py | 4 ++-- .../prepline_test_project/api/process_text_file_4.py | 4 ++-- .../pipelines/templates/pipeline_api.txt | 8 ++++---- 14 files changed, 21 insertions(+), 21 deletions(-) diff --git 
a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py index 095e373..c0f22d6 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py @@ -174,7 +174,7 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py index e3dee1b..5675503 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py @@ -161,7 +161,7 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py index dd18461..d11f103 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py +++ 
b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py @@ -184,7 +184,7 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py index 151834f..ff674c6 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py @@ -197,7 +197,7 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py index b6d9bb3..35fc2c4 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py @@ -200,7 +200,7 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type 
{file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py index f33f7fb..361771a 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py @@ -162,7 +162,7 @@ def response_generator(is_multipart): for file in text_files: file_content_type = get_validated_mimetype(file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py index 1deab0a..8001d96 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py @@ -163,7 +163,7 @@ def response_generator(is_multipart): for file in text_files: file_content_type = get_validated_mimetype(file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py index 4ebb2e7..6483dc0 100644 --- 
a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py @@ -177,7 +177,7 @@ def response_generator(is_multipart): for file in text_files: file_content_type = get_validated_mimetype(file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py index 4cb3bc3..6ffa2f9 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py @@ -186,7 +186,7 @@ def response_generator(is_multipart): for file in text_files: file_content_type = get_validated_mimetype(file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py index 4c3ff53..04118c8 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py @@ -202,7 +202,7 @@ def response_generator(is_multipart): for text_file in text_files_list: file_content_type = 
get_validated_mimetype(text_file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, @@ -225,7 +225,7 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py index 3b1c0c6..4f44670 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py @@ -223,7 +223,7 @@ def response_generator(is_multipart): for text_file in text_files_list: file_content_type = get_validated_mimetype(text_file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, @@ -263,7 +263,7 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py 
b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py index 1f9691a..ca1a102 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py @@ -225,7 +225,7 @@ def response_generator(is_multipart): for text_file in text_files_list: file_content_type = get_validated_mimetype(text_file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, @@ -265,7 +265,7 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py index ef15d83..1105b73 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py @@ -231,7 +231,7 @@ def response_generator(is_multipart): for text_file in text_files_list: file_content_type = get_validated_mimetype(text_file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST, @@ -273,7 +273,7 @@ def 
response_generator(is_multipart): file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST, diff --git a/unstructured_api_tools/pipelines/templates/pipeline_api.txt b/unstructured_api_tools/pipelines/templates/pipeline_api.txt index 4d755af..030c4d2 100644 --- a/unstructured_api_tools/pipelines/templates/pipeline_api.txt +++ b/unstructured_api_tools/pipelines/templates/pipeline_api.txt @@ -203,7 +203,7 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), for text_file in text_files_list: file_content_type = get_validated_mimetype(text_file) - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST @@ -250,7 +250,7 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), file_content_type = get_validated_mimetype(file) - if file_content_type in ["text/plain", "text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST @@ -320,7 +320,7 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), {% if accepts_text %} - if file_content_type not in ["text/plain", "text/markdown", "text/csv"]: + if not file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for text files endpoint.", status_code=status.HTTP_400_BAD_REQUEST @@ -328,7 +328,7 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), text = file.file.read().decode("utf-8") {% elif accepts_file %} - if file_content_type in ["text/plain", 
"text/markdown", "text/csv"]: + if file_content_type.startswith("text/"): raise HTTPException( detail=f"Type {file_content_type} not supported for file endpoint.", status_code=status.HTTP_400_BAD_REQUEST