This repository was archived by the owner on Sep 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathdocs_loader_utils.py
More file actions
133 lines (114 loc) · 4.29 KB
/
docs_loader_utils.py
File metadata and controls
133 lines (114 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
from typing import List
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.vectorstores import FAISS
#from langchain_community.vectorstores import Milvus
from langchain_community.document_loaders import (
CSVLoader,
EverNoteLoader,
PDFMinerLoader,
TextLoader,
UnstructuredEPubLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader, )
# RAG Docs
# Folder scanned by load_docs() for documents to embed into the vector store.
TARGET_FOLDER = "./docs/"
# Maps a splitter name (selected in load_docs) to its langchain text-splitter
# class.  NOTE(review): "SPLITERS" spelling kept — the name is referenced below.
TEXT_SPLITERS = {
    "Character": CharacterTextSplitter,
    "RecursiveCharacter": RecursiveCharacterTextSplitter,
    "Markdown": MarkdownTextSplitter,
}
# Maps a file extension to (loader class, loader kwargs) — consumed by
# load_single_document() to pick the right langchain loader.
LOADERS = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}
def load_docs(embeddings, skip_docs_loading,
              spliter_name="RecursiveCharacter", chunk_size=1000, chunk_overlap=200):
    """
    Build a FAISS vector store, either populated from TARGET_FOLDER or empty.

    Params:
        embeddings: embedding model exposing embed_query()
        skip_docs_loading: when True, return an empty store (no docs ingested)
        spliter_name: key into TEXT_SPLITERS selecting the splitter class
        chunk_size: splitter chunk size in characters
        chunk_overlap: splitter overlap between adjacent chunks
    Returns:
        a FAISS vector store
    """
    if not skip_docs_loading:
        documents = []
        for file_path in os.listdir(TARGET_FOLDER):
            # Only HTML documents are ingested here, even though LOADERS
            # supports more extensions.
            if not file_path.endswith('.html'):
                continue
            abs_path = os.path.join(TARGET_FOLDER, file_path)
            print(f"Loading document {abs_path} embedding into vector store...", flush=True)
            documents.extend(load_single_document(abs_path))
        text_splitter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        db = FAISS.from_documents(texts, embeddings)  # This command populates vector store with embeddings
    else:
        # Bug fix: IndexFlatL2 and InMemoryDocstore were referenced but never
        # imported anywhere in this module, so this branch raised NameError.
        from faiss import IndexFlatL2
        from langchain_community.docstore.in_memory import InMemoryDocstore
        # Probe the embedding model once to learn the vector dimensionality.
        dimensions: int = len(embeddings.embed_query("get_dims"))
        db = FAISS(
            embedding_function=embeddings,
            index=IndexFlatL2(dimensions),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
    return db
def load_single_document(file_path: str) -> List[Document]:
    """
    Load a single document using the loader registered for its extension.

    Params:
        file_path: document path
    Returns:
        documents loaded
    Raises:
        ValueError: if no loader is registered for the file's extension
    """
    # os.path.splitext is robust to extension-less paths (yields "").
    ext = os.path.splitext(file_path)[1]
    if ext in LOADERS:
        loader_class, loader_args = LOADERS[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    # Bug fix: the previous message claimed "File does not exist", but this
    # branch means the extension has no registered loader.
    raise ValueError(f"Unsupported file extension '{ext}' for '{file_path}'")
def pretty_print_docs(docs):
    """Print a preview (first 500 characters) of the documents, each headed
    'Document N:' and separated by a 100-dash horizontal rule."""
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n{doc.page_content}")
    preview = separator.join(sections)
    print(preview[:500])
def chunk_transcript_docs(docs, chunk_size=1500):
    """
    Merge consecutive transcript fragments into chunks of at least
    ``chunk_size`` characters (the trailing remainder is emitted as-is).

    Each input doc must carry ``metadata["timestamp"]`` holding the string
    repr of a (start, end) literal pair — assumed from the indexing below;
    TODO confirm against the producer of these docs.

    Params:
        docs: sequence of Documents with page_content and a timestamp entry
        chunk_size: minimum number of characters per merged chunk
    Returns:
        list of new Documents whose metadata holds "start"/"end" strings
        taken from the first/last fragment of the chunk
    """
    import ast  # local import: only this function parses timestamp literals

    new_docs = []
    batched_len = 0
    batch_start_ts = None
    batched_text = ""
    last_index = len(docs) - 1
    for i, doc in enumerate(docs):
        text = doc.page_content
        batched_len += len(text)
        batched_text += text
        if batch_start_ts is None:
            batch_start_ts = doc.metadata["timestamp"]
        # Flush when the batch is large enough, or at the final doc if any
        # text remains batched.
        if batched_len >= chunk_size or (batched_len > 0 and i == last_index):
            batch_end_ts = doc.metadata["timestamp"]
            # Bug fix: eval() executed arbitrary code embedded in metadata;
            # ast.literal_eval safely parses only Python literals.
            new_docs.append(Document(
                page_content=batched_text.lstrip(),
                metadata={
                    "start": str(ast.literal_eval(batch_start_ts)[0]),
                    "end": str(ast.literal_eval(batch_end_ts)[1]),
                },
            ))
            batch_start_ts = None
            batched_text = ""
            batched_len = 0
    return new_docs
def format_transcript_docs(docs):
    """Render each doc as 'start; end; content' plus a newline, then join
    the pieces with single spaces."""
    pieces = []
    for doc in docs:
        meta = doc.metadata
        pieces.append(f"{meta['start']}; {meta['end']}; {doc.page_content}\n")
    return " ".join(pieces)
def format_docs(docs):
    """Join the page contents of all docs into one space-separated string."""
    contents = [doc.page_content for doc in docs]
    return " ".join(contents)