| 
1 | 1 | # Copyright (C) 2024 Intel Corporation  | 
2 | 2 | # SPDX-License-Identifier: Apache-2.0  | 
3 | 3 | 
 
  | 
4 |  | -import asyncio  | 
5 | 4 | import base64  | 
6 | 5 | import os  | 
7 | 6 | import subprocess  | 
@@ -55,7 +54,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k  | 
55 | 54 |     return inputs  | 
56 | 55 | 
 
  | 
57 | 56 | 
 
  | 
58 |  | -def read_pdf(file):  | 
 | 57 | +def read_pdf(file: str):  | 
59 | 58 |     from langchain.document_loaders import PyPDFLoader  | 
60 | 59 | 
 
  | 
61 | 60 |     loader = PyPDFLoader(file)  | 
@@ -101,29 +100,50 @@ def video2audio(  | 
101 | 100 |     return audio_base64  | 
102 | 101 | 
 
  | 
103 | 102 | 
 
  | 
104 |  | -def read_text_from_file(file, save_file_name):  | 
 | 103 | +async def read_text_from_file(file: UploadFile):  | 
 | 104 | +    ctype = file.headers["content-type"]  | 
 | 105 | +    valid = (  | 
 | 106 | +        "text/plain",  | 
 | 107 | +        "application/pdf",  | 
 | 108 | +        "application/octet-stream",  | 
 | 109 | +        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  | 
 | 110 | +    )  | 
 | 111 | + | 
 | 112 | +    file_content = None  | 
 | 113 | +    if ctype not in valid:  | 
 | 114 | +        return file_content  | 
 | 115 | + | 
 | 116 | +    import aiofiles  | 
105 | 117 |     import docx2txt  | 
106 | 118 |     from langchain.text_splitter import CharacterTextSplitter  | 
107 | 119 | 
 
  | 
108 | 120 |     # read text file  | 
109 |  | -    if file.headers["content-type"] == "text/plain":  | 
 | 121 | +    if ctype == "text/plain":  | 
110 | 122 |         file.file.seek(0)  | 
111 | 123 |         content = file.file.read().decode("utf-8")  | 
112 |  | -        # Split text  | 
 | 124 | +        # Split text into multiple documents  | 
113 | 125 |         text_splitter = CharacterTextSplitter()  | 
114 |  | -        texts = text_splitter.split_text(content)  | 
115 |  | -        # Create multiple documents  | 
116 |  | -        file_content = texts  | 
117 |  | -    # read pdf file  | 
118 |  | -    elif file.headers["content-type"] == "application/pdf":  | 
119 |  | -        documents = read_pdf(save_file_name)  | 
120 |  | -        file_content = [doc.page_content for doc in documents]  | 
121 |  | -    # read docx file  | 
122 |  | -    elif (  | 
123 |  | -        file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  | 
124 |  | -        or file.headers["content-type"] == "application/octet-stream"  | 
125 |  | -    ):  | 
126 |  | -        file_content = docx2txt.process(save_file_name)  | 
 | 126 | +        return text_splitter.split_text(content)  | 
 | 127 | + | 
 | 128 | +    # the remaining formats (PDF, DOCX) are read by file name, so write the upload to a temp file first  | 
 | 129 | +    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:  | 
 | 130 | +        await tmp.write(await file.read())  | 
 | 131 | +        await tmp.flush()  | 
 | 132 | + | 
 | 133 | +        # read pdf file  | 
 | 134 | +        if ctype == "application/pdf":  | 
 | 135 | +            documents = read_pdf(tmp.name)  | 
 | 136 | +            file_content = [doc.page_content for doc in documents]  | 
 | 137 | + | 
 | 138 | +        # read docx file  | 
 | 139 | +        if ctype in (  | 
 | 140 | +            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  | 
 | 141 | +            "application/octet-stream",  | 
 | 142 | +        ):  | 
 | 143 | +            file_content = docx2txt.process(tmp.name)  | 
 | 144 | + | 
 | 145 | +        # remove temp file  | 
 | 146 | +        await tmp.close()  | 
127 | 147 | 
 
  | 
128 | 148 |     return file_content  | 
129 | 149 | 
 
  | 
@@ -188,25 +208,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(  | 
188 | 208 |             file_summaries = []  | 
189 | 209 |             if files:  | 
190 | 210 |                 for file in files:  | 
191 |  | -                    # Fix concurrency issue with the same file name  | 
192 |  | -                    # https://github.com/opea-project/GenAIExamples/issues/1279  | 
193 |  | -                    uid = str(uuid.uuid4())  | 
194 |  | -                    file_path = f"/tmp/{uid}"  | 
195 |  | - | 
196 | 211 |                     if data_type is not None and data_type in ["audio", "video"]:  | 
197 | 212 |                         raise ValueError(  | 
198 | 213 |                             "Audio and Video file uploads are not supported in docsum with curl request, \  | 
199 | 214 |                                 please use the UI or pass base64 string of the content directly."  | 
200 | 215 |                         )  | 
201 | 216 | 
 
  | 
202 | 217 |                     else:  | 
203 |  | -                        import aiofiles  | 
204 |  | - | 
205 |  | -                        async with aiofiles.open(file_path, "wb") as f:  | 
206 |  | -                            await f.write(await file.read())  | 
207 |  | - | 
208 |  | -                        docs = read_text_from_file(file, file_path)  | 
209 |  | -                        os.remove(file_path)  | 
 | 218 | +                        docs = await read_text_from_file(file)  | 
210 | 219 | 
 
  | 
211 | 220 |                         if isinstance(docs, list):  | 
212 | 221 |                             file_summaries.extend(docs)  | 
 | 
0 commit comments