24 changes: 24 additions & 0 deletions README.md
@@ -22,7 +22,31 @@
## LLM models

- We recommend the 4-bit quantized Gemma 2b model, which can be downloaded from [HuggingFace](https://huggingface.co/lmstudio-ai/gemma-2b-it-GGUF/blob/main/gemma-2b-it-q4_k_m.gguf).
- In practice, the Gemma 2b model above is too weak to generate accurate responses, so we use the Mistral-7B model from [ollama](https://github.com/ollama/ollama).

## Embedding model

- We recommend the lightweight BERT-like model all-MiniLM-L6-v2 for sentence embeddings, which can be obtained directly from [HuggingFace](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).
- The embedding functions available when constructing the VectorStore database include GPT4AllEmbeddings and HuggingFaceEmbeddings; a minimal construction sketch follows.
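
For reference, here is a minimal sketch of building the vectorstore with GPT4AllEmbeddings, mirroring the pipeline code in this PR; the persist directory below is a placeholder path, not a committed artifact:

```python
# Minimal sketch mirroring the pipeline code in this PR.
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

embeddings = GPT4AllEmbeddings(
    model_name="all-MiniLM-L6-v2.gguf2.f16.gguf",
    gpt4all_kwargs={"allow_download": "True"},
)
vectorstore = Chroma(
    persist_directory="path/to/chroma_wiki",  # placeholder path
    collection_name="rag-chroma",
    embedding_function=embeddings,
)
retriever = vectorstore.as_retriever()
```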

## Retrieval-Augmented Generation (RAG)
- RAG is used to refine the generation process, yielding more accurate predictions and fewer hallucinated results.
- The implementation follows the paper [Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection](http://arxiv.org/abs/2310.11511).
- Self-RAG is a RAG strategy that incorporates self-reflection (grading) on retrieved documents and generations. In the paper, a few decisions are made (a minimal grading sketch follows this list):
  - Should I retrieve documents?
    - Input: `x (question)`, `y (generation)`
    - Decides when to retrieve `D` chunks with retriever `R`
    - Output: `{yes, no, continue}`
  - Are the retrieved passages `D` relevant to the question `x`?
    - Input: `x (question)`, `d (chunk)` for `d` in `D`
    - `d` provides useful information to solve `x`
    - Output: `{relevant, irrelevant}`
  - Is the LLM generation from each chunk in `D` relevant to the chunk (hallucinations, etc.)?
    - Input: `x (question)`, `d (chunk)`, `y (generation)` for `d` in `D`
    - All of the verification-worthy statements in `y (generation)` are supported by `d`
    - Output: `{fully supported, partially supported, no support}`
  - Is the LLM generation from each chunk in `D` a useful response to `x (question)`?
    - Input: `x (question)`, `y (generation)` for `d` in `D`
    - `y (generation)` is a useful response to `x (question)`.
    - Output: `{yes, no}`
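
As referenced above, here is a minimal, hypothetical sketch of the retrieval-relevance grading step. It assumes an `llm` callable that maps a prompt string to a text completion (for example a Mistral-7B wrapper); it illustrates the grading idea and is not the paper's trained critic model:

```python
# Hypothetical sketch of the retrieval-relevance grading step.
# `llm` is assumed to be any callable mapping a prompt string to text
# (e.g. a Mistral-7B wrapper); it is NOT the paper's trained critic.
GRADE_PROMPT = (
    "You are a grader assessing whether a retrieved document is relevant "
    "to a user question.\n"
    "Document:\n{document}\n\n"
    "Question: {question}\n"
    "Answer with exactly one word: relevant or irrelevant."
)

def grade_relevance(llm, question, documents):
    """Keep only the chunks the critic judges relevant to the question."""
    relevant = []
    for d in documents:
        verdict = llm(GRADE_PROMPT.format(document=d, question=question))
        if "irrelevant" not in verdict.lower():
            relevant.append(d)
    return relevant
```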

@@ -0,0 +1,119 @@
# -*- coding: ascii -*-

# RAG-enhanced LLM pipeline for the classification task on rel-movielens1M
# Paper: Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection https://arxiv.org/abs/2310.11511
# Title only: macro_f1: 0.251, micro_f1: 0.387
# Full info: macro_f1: 0.892, micro_f1: 0.884
# Runtime: Title only: 2990s; Full info: 6757s (on a single 6G GPU)
# Cost: Title only: $0.2722; Full info: $0.5996
# Description: Given the movie name and limited genre information, relevant documents are retrieved from a Wikipedia database to assist the LLM in predicting movie genres. We introduce Self-RAG to critique retrieval and generation with critique tokens.
# Usage: python rag-movielens1m_clf.py --prompt title/all

# Append rllm to search path
import sys
sys.path.append("../../../../")
import time
import argparse

import pandas as pd

from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

from itertools import islice

from langchain.schema import BaseOutputParser
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

from rllm.utils import macro_f1_score, micro_f1_score, get_llm_chat_cost
from rllm.selfrag_func import self_rag
# Needed for the --prompt all branch below; missing in the original file.
from rllm.selfrag_func_all import self_rag_all

##### Parse argument
parser = argparse.ArgumentParser()
parser.add_argument('--prompt', choices=['title', 'all'],
                    default='title', help='Choose prompt type.')
args = parser.parse_args()

##### Start time
time_start = time.time()

##### Global variables
total_cost = 0
test_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/classification/movies/test.csv"

def parse(text: str):
    """Parse the output of an LLM call into a genre list."""
    genres = text.split('::')[-1]
    genre_list = [genre.strip() for genre in genres.split(',')]
    return genre_list

class GenreOutputParser(BaseOutputParser):
    """Parse the output of the LLM into a genre list."""

    def parse(self, text: str):
        """Parse the output of an LLM call."""
        genres = text.split('::')[-1]
        genre_list = [genre.strip() for genre in genres.split(',')]
        return genre_list

output_parser = GenreOutputParser()

##### 1. Load vectorstore and retriever
# Load documents from the persist directory of the vectorstore.
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)
vectorstore = Chroma(
    persist_directory="/home/qinghua_mao/work/rllm/chroma_wiki",
    collection_name="rag-chroma",
    embedding_function=embeddings
)

# Retrieve relevant snippets from the Wikipedia vectorstore.
retriever = vectorstore.as_retriever()

##### 2. LLM prediction
movie_df = pd.read_csv(test_path)

pred_genre_list = []
if args.prompt == 'title':
    for index, row in tqdm(islice(movie_df.iterrows(), 300), total=min(len(movie_df), 300), desc="Processing Movies"):
        pred, prompt_cost = self_rag(movie_name=row['Title'], prompt="title", retriever=retriever)
        total_cost = total_cost + prompt_cost
        genres = parse(pred)
        pred_genre_list.append(genres)

        total_cost = total_cost + get_llm_chat_cost(','.join(genres), 'output')
else:
    for index, row in tqdm(islice(movie_df.iterrows(), 300), total=min(len(movie_df), 300), desc="Processing Movies"):
        pred, prompt_cost = self_rag_all(prompt="all", retriever=retriever, Title=row['Title'], Director=row['Director'], Year=row['Year'],
                                         Genre=row['Genre'], Cast=row['Cast'], Runtime=row['Runtime'],
                                         Languages=row['Languages'], Certificate=row['Certificate'],
                                         Plot=row['Plot'])
        total_cost = total_cost + prompt_cost
        genres = parse(pred)
        pred_genre_list.append(genres)

        total_cost = total_cost + get_llm_chat_cost(','.join(genres), 'output')

##### 3. Calculate F1 scores
# Get all genres
movie_genres = movie_df["Genre"].iloc[:300].str.split("|")
all_genres = list(set([genre for genres in movie_genres for genre in genres]))

mlb = MultiLabelBinarizer(classes=all_genres)
real_genres_matrix = mlb.fit_transform(movie_genres)
# mlb is fitted with a fixed class list, so transform (not fit_transform) suffices here.
pred_genres_matrix = mlb.transform(pred_genre_list)
macro_f1 = macro_f1_score(real_genres_matrix, pred_genres_matrix)
micro_f1 = micro_f1_score(real_genres_matrix, pred_genres_matrix)

##### End time
time_end = time.time()

print(f"macro_f1: {macro_f1}")
print(f"micro_f1: {micro_f1}")
print(f"Total time: {time_end - time_start}s")
print(f"Total USD$: {total_cost}")
@@ -0,0 +1,120 @@
# -*- coding: ascii -*-

# RAG-enhanced LLM pipeline for the classification task on rel-movielens1M
# Paper: Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection https://arxiv.org/abs/2310.11511
# Title only: macro_f1: 0.251, micro_f1: 0.387
# Full info: macro_f1: 0.892, micro_f1: 0.884
# Runtime: Title only: 2990s; Full info: 6757s (on a single 6G GPU)
# Cost: Title only: $0.2722; Full info: $0.5996
# Description: Given the movie name and limited genre information, relevant documents are retrieved from a Wikipedia database to assist the LLM in predicting movie genres. We introduce Self-RAG to critique retrieval and generation with critique tokens.
# Usage: python rag-movielens1m_clf.py --prompt title/all

# Append rllm to search path
import sys
sys.path.append("../../../../")
import time
import argparse

import pandas as pd

from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

from itertools import islice

from langchain.schema import BaseOutputParser
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

from rllm.utils import macro_f1_score, micro_f1_score, get_llm_chat_cost
from rllm.selfrag_func import self_rag
from rllm.selfrag_func_all import self_rag_all

##### Parse argument
parser = argparse.ArgumentParser()
parser.add_argument('--prompt', choices=['title', 'all'],
                    default='title', help='Choose prompt type.')
args = parser.parse_args()

##### Start time
time_start = time.time()

##### Global variables
total_cost = 0
test_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/classification/movies/test.csv"

def parse(text: str):
    """Parse the output of an LLM call into a genre list."""
    genres = text.split('::')[-1]
    genre_list = [genre.strip() for genre in genres.split(',')]
    return genre_list

class GenreOutputParser(BaseOutputParser):
    """Parse the output of the LLM into a genre list."""

    def parse(self, text: str):
        """Parse the output of an LLM call."""
        genres = text.split('::')[-1]
        genre_list = [genre.strip() for genre in genres.split(',')]
        return genre_list

output_parser = GenreOutputParser()

##### 1. Load vectorstore and retriever
# Load documents from the persist directory of the vectorstore.
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)
vectorstore = Chroma(
    persist_directory="/home/qinghua_mao/work/rllm/chroma_wiki",
    collection_name="rag-chroma",
    embedding_function=embeddings
)

# Retrieve relevant snippets from the Wikipedia vectorstore.
retriever = vectorstore.as_retriever()

##### 2. LLM prediction
movie_df = pd.read_csv(test_path)

pred_genre_list = []
if args.prompt == 'title':
    for index, row in tqdm(islice(movie_df.iterrows(), 5), total=min(len(movie_df), 5), desc="Processing Movies"):
        pred, prompt_cost = self_rag(movie_name=row['Title'], prompt="title", retriever=retriever)
        total_cost = total_cost + prompt_cost
        genres = parse(pred)
        pred_genre_list.append(genres)

        total_cost = total_cost + get_llm_chat_cost(','.join(genres), 'output')
else:
    for index, row in tqdm(islice(movie_df.iterrows(), 5), total=min(len(movie_df), 5), desc="Processing Movies"):
        pred, prompt_cost = self_rag_all(prompt="all", retriever=retriever, Title=row['Title'], Director=row['Director'], Year=row['Year'],
                                         Genre=row['Genre'], Cast=row['Cast'], Runtime=row['Runtime'],
                                         Languages=row['Languages'], Certificate=row['Certificate'],
                                         Plot=row['Plot'])
        total_cost = total_cost + prompt_cost
        genres = parse(pred)
        pred_genre_list.append(genres)

        total_cost = total_cost + get_llm_chat_cost(','.join(genres), 'output')

##### 3. Calculate F1 scores
# Get all genres
movie_genres = movie_df["Genre"].iloc[:5].str.split("|")
all_genres = list(set([genre for genres in movie_genres for genre in genres]))

mlb = MultiLabelBinarizer(classes=all_genres)
real_genres_matrix = mlb.fit_transform(movie_genres)
# mlb is fitted with a fixed class list, so transform (not fit_transform) suffices here.
pred_genres_matrix = mlb.transform(pred_genre_list)
macro_f1 = macro_f1_score(real_genres_matrix, pred_genres_matrix)
micro_f1 = micro_f1_score(real_genres_matrix, pred_genres_matrix)

##### End time
time_end = time.time()

print(f"macro_f1: {macro_f1}")
print(f"micro_f1: {micro_f1}")
print(f"Total time: {time_end - time_start}s")
print(f"Total USD$: {total_cost}")
@@ -24,6 +24,14 @@
from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rllm.utils import macro_f1_score, micro_f1_score, get_llm_chat_cost

##### Parse argument
@@ -37,8 +45,23 @@

##### Global variables
total_cost = 0
test_path = "your/test_file/path"
llm_model_path = "your/llm/path"
test_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/classification/movies/test.csv"
llm_model_path = "/home/qinghua_mao/work/rllm/gemma-2b-it-GGUF/gemma-2b-it-q4_k_m.gguf"
embed_path = "/home/qinghua_mao/work/rllm/all-MiniLM-L6-v2"

from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

class CustomEmbeddings(Embeddings):
    """Wrap a local SentenceTransformer checkpoint as a LangChain Embeddings object."""

    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        return [self.model.encode(d).tolist() for d in documents]

    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0].tolist()
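
# Example usage (hypothetical, not part of the original file):
#   embeddings = CustomEmbeddings(model_name=embed_path)
#   query_vector = embeddings.embed_query("Which genres fit this plot?")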

##### 1. Construct LLM chain
# Load model
@@ -104,6 +127,7 @@ def parse(self, text: str):
        total_cost = total_cost + get_llm_chat_cost(prompt_title_template.invoke({"movie_name": row['Title']}).text, 'input')

        pred = chain.invoke({"movie_name": row['Title']})
        print(pred)
        pred_genre_list.append(pred)

        total_cost = total_cost + get_llm_chat_cost(','.join(pred), 'output')
@@ -27,11 +27,25 @@

##### Global variables
total_cost = 0
train_path = "your/train_file/path"
movie_path = "your/movie_file/path"
test_path = "your/test_file/path"
llm_model_path = "your/llm/path"
train_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/regression/ratings/train.csv"
movie_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/regression/movies.csv"
test_path = "/home/qinghua_mao/work/rllm/rllm/datasets/rel-movielens1m/regression/ratings/test.csv"
llm_model_path = "/home/qinghua_mao/work/rllm/gemma-2b-it-GGUF/gemma-2b-it-q4_k_m.gguf"
embed_path = "/home/qinghua_mao/work/rllm/all-MiniLM-L6-v2"

from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

class CustomEmbeddings(Embeddings):
    """Wrap a local SentenceTransformer checkpoint as a LangChain Embeddings object."""

    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        return [self.model.encode(d).tolist() for d in documents]

    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0].tolist()
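
# Example usage (hypothetical, not part of the original file):
#   embeddings = CustomEmbeddings(model_name=embed_path)
#   query_vector = embeddings.embed_query("Which genres fit this plot?")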

##### 1. Construct LLM chain
# Load model