diff --git a/CHANGELOG.md b/CHANGELOG.md index 854a602..9fff13b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.10.6 + +* Refinement around handling of mixed text file / non-text file requests + # 0.10.5 * Add optional CORS to api diff --git a/test_unstructured_api_tools/api/test_file_apis.py b/test_unstructured_api_tools/api/test_file_apis.py index a97bdbf..782f8b1 100644 --- a/test_unstructured_api_tools/api/test_file_apis.py +++ b/test_unstructured_api_tools/api/test_file_apis.py @@ -31,6 +31,8 @@ GZIP_FILE_IMAGE, GZIP_FILE_DOCX, FILE_MARKDOWN, + FILE_TXT_1, + GZIP_FILE_TXT_1, ) # accepts: files, input2 @@ -224,6 +226,9 @@ def _json_for_one_file(test_file): ([], P_INPUT_1_EMPTY, JSON, 400, None), ([GZIP_FILE_DOCX], P_INPUT_1_EMPTY, JSON, 200, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_DOCX], P_INPUT_1_EMPTY, JSON, 200, FILENAME_FORMATS[FILE_IMAGE]), + ([FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), + ([FILE_DOCX, FILE_IMAGE, FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), + ([FILE_DOCX, GZIP_FILE_TXT_1], P_INPUT_1_EMPTY, JSON, 400, None), ], ) def test_process_file_1( @@ -235,6 +240,8 @@ def test_process_file_1( data = test_params if gz_content_type: data["gz_uncompressed_content_type"] = gz_content_type + else: + data["gz_uncompressed_content_type"] = None response = client.post( endpoint, @@ -270,13 +277,15 @@ def test_process_file_1( ([FILE_DOCX, GZIP_FILE_IMAGE], MIXED, 200, None, False, None), ([GZIP_FILE_DOCX, GZIP_FILE_IMAGE], MIXED, 200, None, False, None), ([GZIP_FILE_DOCX, GZIP_FILE_IMAGE], TEXT_CSV, 406, None, False, None), - ([FILE_MARKDOWN, GZIP_FILE_IMAGE], JSON, 200, None, False, None), - ([FILE_MARKDOWN], JSON, 200, None, False, None), - ([FILE_MARKDOWN], JSON, 200, None, True, None), + ([FILE_MARKDOWN, GZIP_FILE_IMAGE], JSON, 400, None, False, None), + ([FILE_MARKDOWN], JSON, 400, None, False, None), + ([FILE_MARKDOWN], JSON, 400, None, True, None), ([FILE_MSG], JSON, 200, None, True, None), ([FILE_JSON], JSON, 200, None, True, None), ([GZIP_FILE_DOCX], JSON, 200, None, False, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_DOCX], JSON, 200, None, False, FILENAME_FORMATS[FILE_IMAGE]), + ([GZIP_FILE_DOCX, GZIP_FILE_TXT_1], JSON, 400, None, False, None), + ([FILE_TXT_1], JSON, 400, None, False, None), ], ) def test_process_file_2( @@ -421,12 +430,12 @@ def test_process_file_2( None, None, ), - ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, True, None, None), + ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 400, True, None, None), ( [FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_LABELSTUDIO, - 200, + 400, False, FILENAME_FORMATS[FILE_MARKDOWN], None, @@ -459,6 +468,33 @@ def test_process_file_2( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + 400, + False, + None, + None, + ), + ( + [FILE_DOCX, FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + 400, + False, + None, + None, + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + 400, + False, + None, + None, + ), ], ) def test_process_file_3( @@ -630,7 +666,7 @@ def test_process_file_3( False, None, ), - ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 200, None, True, None), + ([FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_SINGLE, 400, None, True, None), ( [GZIP_FILE_DOCX, GZIP_FILE_IMAGE], MIXED, @@ -663,6 +699,36 @@ def test_process_file_3( False, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), + ( + [FILE_DOCX, FILE_MARKDOWN], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + 400, + None, + False, + None, + ), ], ) def test_process_file_4( @@ -871,7 +937,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, False, None, None, @@ -882,7 +948,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, True, None, None, @@ -893,7 +959,7 @@ def test_process_file_4( RESPONSE_SCHEMA_LABELSTUDIO, P_INPUT_1_MULTI, P_INPUT_2_EMPTY, - 200, + 400, False, FILENAME_FORMATS[FILE_MARKDOWN], None, @@ -975,6 +1041,28 @@ def test_process_file_4( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_DOCX, FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_ISD, + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + None, + ), + ( + [FILE_TXT_1], + JSON, + RESPONSE_SCHEMA_LABELSTUDIO, + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + None, + ), ], ) def test_process_file_5( diff --git a/test_unstructured_api_tools/api/test_file_text_apis.py b/test_unstructured_api_tools/api/test_file_text_apis.py index 2c8eda8..566087a 100644 --- a/test_unstructured_api_tools/api/test_file_text_apis.py +++ b/test_unstructured_api_tools/api/test_file_text_apis.py @@ -311,18 +311,18 @@ def _json_for_one_file(test_file=None, test_text_file=None): ([FILE_DOCX, FILE_IMAGE], [GZIP_FILE_TXT_1, GZIP_FILE_TXT_2], 200, JSON, False, None, None), ([FILE_DOCX], [GZIP_FILE_TXT_2], 200, JSON, False, None, None), ([GZIP_FILE_IMAGE], [GZIP_FILE_TXT_1], 200, JSON, False, None, None), - ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 200, JSON, True, None, None), + ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 400, JSON, True, None, None), ( [FILE_MARKDOWN], [GZIP_FILE_TXT_1], - 200, + 400, JSON, False, f"{FILENAME_FORMATS[FILE_MARKDOWN]},{FILENAME_FORMATS[FILE_TXT_1]}", None, ), ([FILE_MARKDOWN], [GZIP_FILE_TXT_1], 400, JSON, False, FILENAME_FORMATS[FILE_TXT_1], None), - ([FILE_MARKDOWN, FILE_DOCX], [GZIP_FILE_TXT_1, FILE_TXT_2], 200, MIXED, False, None, None), + ([FILE_DOCX], [GZIP_FILE_TXT_1, FILE_TXT_2, FILE_MARKDOWN], 200, MIXED, False, None, None), ([], [], 400, JSON, False, None, None), ( [FILE_MARKDOWN, FILE_DOCX], @@ -337,6 +337,9 @@ def _json_for_one_file(test_file=None, test_text_file=None): ([FILE_DOCX], [], 200, JSON, False, None, None), ([GZIP_FILE_DOCX], [FILE_TXT_1], 200, JSON, False, None, FILENAME_FORMATS[FILE_DOCX]), ([GZIP_FILE_IMAGE], [], 200, JSON, False, None, FILENAME_FORMATS[FILE_IMAGE]), + ([FILE_TXT_1], [], 400, JSON, False, None, None), + ([], [FILE_DOCX], 400, JSON, False, None, None), + ([FILE_DOCX, FILE_IMAGE, FILE_MARKDOWN], [FILE_TXT_1], 400, JSON, False, None, None), ], ) def test_process_file_text_1( @@ -431,10 +434,10 @@ def test_process_file_text_1( ([GZIP_FILE_IMAGE], [GZIP_FILE_TXT_1], JSON, P_INPUT_2_MULTI, 200, False, None, None), ([], [FILE_TXT_1], TEXT_CSV, P_INPUT_2_EMPTY, 406, False, None, None), ([], [FILE_TXT_1], JSON, P_INPUT_2_EMPTY, 200, False, None, None), - ([FILE_MARKDOWN], [FILE_TXT_1], JSON, P_INPUT_2_EMPTY, 200, True, None, None), + ([], [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_EMPTY, 200, True, None, None), ( - [FILE_MARKDOWN], - [FILE_TXT_1], + [], + [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_MULTI, 200, @@ -443,8 +446,8 @@ def test_process_file_text_1( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_1], + [], + [FILE_TXT_1, FILE_MARKDOWN], JSON, P_INPUT_2_SINGLE, 400, @@ -453,7 +456,7 @@ def test_process_file_text_1( None, ), ([], [], JSON, P_INPUT_2_EMPTY, 400, False, None, None), - ([FILE_MARKDOWN], [FILE_TXT_1], TEXT_CSV, P_INPUT_2_MULTI, 406, False, None, None), + ([], [FILE_TXT_1, FILE_MARKDOWN], TEXT_CSV, P_INPUT_2_MULTI, 406, False, None, None), ([], [FILE_TXT_1], JSON, P_INPUT_2_SINGLE, 200, False, None, None), ([FILE_DOCX], [], JSON, P_INPUT_2_SINGLE, 200, False, None, None), ([], [FILE_TXT_1], MIXED, P_INPUT_2_EMPTY, 200, False, None, None), @@ -477,6 +480,46 @@ def test_process_file_text_1( None, FILENAME_FORMATS[FILE_IMAGE], ), + ( + [GZIP_FILE_DOCX, GZIP_FILE_TXT_1], + [FILE_TXT_2], + JSON, + P_INPUT_2_SINGLE, + 400, + False, + None, + None, + ), + ( + [GZIP_FILE_DOCX], + [GZIP_FILE_IMAGE], + JSON, + P_INPUT_2_MULTI, + 400, + False, + None, + None, + ), + ( + [], + [FILE_MARKDOWN], + JSON, + P_INPUT_2_EMPTY, + 200, + True, + None, + None, + ), + ( + [], + [], + JSON, + P_INPUT_1_EMPTY, + 400, + False, + None, + None, + ), ], ) def test_process_file_text_2( @@ -645,8 +688,8 @@ def test_process_file_text_2( ([], [GZIP_FILE_TXT_1], TEXT_CSV, RESPONSE_SCHEMA_LABELSTUDIO, 406, False, None, None), ([], [GZIP_FILE_TXT_1], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 200, False, None, None), ( - [FILE_DOCX, FILE_MARKDOWN], - [GZIP_FILE_TXT_1], + [FILE_DOCX], + [GZIP_FILE_TXT_1, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, 200, @@ -665,8 +708,8 @@ def test_process_file_text_2( None, ), ( - [FILE_DOCX, FILE_MARKDOWN], - [GZIP_FILE_TXT_1], + [FILE_DOCX], + [GZIP_FILE_TXT_1, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, 400, @@ -985,8 +1028,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, @@ -997,8 +1040,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, @@ -1009,8 +1052,8 @@ def test_process_file_text_3( None, ), ( - [FILE_MARKDOWN], - [FILE_TXT_2], + [], + [FILE_TXT_2, FILE_MARKDOWN], JSON, RESPONSE_SCHEMA_ISD, P_INPUT_1_MULTI, diff --git a/test_unstructured_api_tools/api/test_text_apis.py b/test_unstructured_api_tools/api/test_text_apis.py index cf64307..9a0c6fa 100644 --- a/test_unstructured_api_tools/api/test_text_apis.py +++ b/test_unstructured_api_tools/api/test_text_apis.py @@ -27,6 +27,8 @@ GZIP_FILE_TXT_2, FILE_MARKDOWN, FILENAME_FORMATS, + FILE_DOCX, + GZIP_FILE_DOCX, ) # accepts: text files @@ -164,6 +166,9 @@ def _json_for_one_file(test_file): ([], 400, False, None, JSON, None), ([GZIP_FILE_TXT_1], 200, False, None, JSON, None), ([GZIP_FILE_TXT_1], 200, False, None, JSON, FILENAME_FORMATS[FILE_TXT_1]), + ([FILE_DOCX], 400, False, None, JSON, None), + ([GZIP_FILE_DOCX], 400, False, None, JSON, None), + ([FILE_TXT_1, FILE_DOCX], 400, False, None, JSON, None), ], ) def test_process_text_1( @@ -344,6 +349,36 @@ def test_process_text_1( JSON, FILENAME_FORMATS[FILE_TXT_1], ), + ( + [FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ), + ( + [GZIP_FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ), + ( + [FILE_TXT_1, GZIP_FILE_DOCX], + P_INPUT_1_EMPTY, + P_INPUT_2_EMPTY, + 400, + False, + None, + JSON, + None, + ), ], ) def test_process_text_2( @@ -434,6 +469,9 @@ def test_process_text_2( ([FILE_TXT_2, FILE_MARKDOWN], None, 406, False, FILENAME_FORMATS[FILE_TXT_1], None), ([], None, 400, False, None, None), ([GZIP_FILE_TXT_1], JSON, 200, False, None, FILENAME_FORMATS[FILE_TXT_1]), + ([FILE_DOCX], JSON, 400, False, None, None), + ([FILE_TXT_1, GZIP_FILE_DOCX], JSON, 400, False, None, None), + ([FILE_TXT_1, FILE_TXT_1, FILE_DOCX], JSON, 400, False, None, None), ], ) def test_process_text_3( @@ -522,6 +560,18 @@ def test_process_text_3( None, FILENAME_FORMATS[FILE_TXT_1], ), + ([FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), + ([FILE_TXT_1, FILE_DOCX], JSON, RESPONSE_SCHEMA_ISD, 400, False, None, None), + ([GZIP_FILE_DOCX], JSON, RESPONSE_SCHEMA_LABELSTUDIO, 400, False, None, None), + ( + [FILE_TXT_1, GZIP_FILE_TXT_1, GZIP_FILE_DOCX], + JSON, + RESPONSE_SCHEMA_ISD, + 400, + False, + None, + None, + ), ], ) def test_process_text_4( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py index b982ef6..c0f22d6 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py @@ -174,6 +174,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py index 61475e3..5675503 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py @@ -159,7 +159,13 @@ def pipeline_1( def response_generator(is_multipart): for file in files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) _file = file.file diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py index 7408d07..d11f103 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py @@ -182,7 +182,13 @@ def pipeline_1( def response_generator(is_multipart): for file in files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) _file = file.file diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py index 0fdb752..ff674c6 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py @@ -197,6 +197,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py index a02d4bf..35fc2c4 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py @@ -200,6 +200,12 @@ def response_generator(is_multipart): for file in files: file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + _file = file.file response = pipeline_api( diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py index 728ef5f..361771a 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py @@ -160,7 +160,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py index 8b04ab0..8001d96 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py @@ -161,7 +161,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py index 2897630..6483dc0 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py @@ -175,7 +175,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py index 5cea38a..6ffa2f9 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py @@ -184,7 +184,13 @@ def pipeline_1( def response_generator(is_multipart): for file in text_files: - get_validated_mimetype(file) + file_content_type = get_validated_mimetype(file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) text = file.file.read().decode("utf-8") diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py index 5ff1efa..04118c8 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py @@ -200,6 +200,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -217,6 +225,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py index a5785de..4f44670 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_2.py @@ -221,6 +221,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -255,6 +263,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py index ca5caea..ca1a102 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_3.py @@ -223,6 +223,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -257,6 +265,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py index 2b5ff7e..1105b73 100644 --- a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py +++ b/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_4.py @@ -229,6 +229,14 @@ def pipeline_1( def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -265,6 +273,12 @@ def response_generator(is_multipart): file_content_type = get_validated_mimetype(file) + if file_content_type.startswith("text"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST, + ) + response = pipeline_api( text=None, file=_file, diff --git a/unstructured_api_tools/__version__.py b/unstructured_api_tools/__version__.py index 3e778da..b3f2482 100644 --- a/unstructured_api_tools/__version__.py +++ b/unstructured_api_tools/__version__.py @@ -1 +1 @@ -__version__ = "0.10.5" # pragma: no cover +__version__ = "0.10.6" # pragma: no cover diff --git a/unstructured_api_tools/pipelines/templates/pipeline_api.txt b/unstructured_api_tools/pipelines/templates/pipeline_api.txt index fc921fd..030c4d2 100644 --- a/unstructured_api_tools/pipelines/templates/pipeline_api.txt +++ b/unstructured_api_tools/pipelines/templates/pipeline_api.txt @@ -201,6 +201,14 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), ) def response_generator(is_multipart): for text_file in text_files_list: + file_content_type = get_validated_mimetype(text_file) + + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + text = text_file.file.read().decode("utf-8") response = pipeline_api( @@ -240,11 +248,13 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), for file in files_list: _file = file.file - {% if "file_content_type" in optional_param_value_map %} file_content_type = get_validated_mimetype(file) - {% else %} - get_validated_mimetype(file) - {% endif %} + + if file_content_type.startswith("text"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) response = pipeline_api( text=None, @@ -306,15 +316,24 @@ gz_uncompressed_content_type: Optional[str] = Form(default=None), def response_generator(is_multipart): for file in {{var_name}}: - {% if "file_content_type" in optional_param_value_map %} file_content_type = get_validated_mimetype(file) - {% else %} - get_validated_mimetype(file) - {% endif %} + {% if accepts_text %} + if not file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for text files endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + text = file.file.read().decode("utf-8") {% elif accepts_file %} + if file_content_type.startswith("text/"): + raise HTTPException( + detail=f"Type {file_content_type} not supported for file endpoint.", + status_code=status.HTTP_400_BAD_REQUEST + ) + _file = file.file {% endif %}