From cb6c7a13f02bd2581994bc29300cef21014f82d4 Mon Sep 17 00:00:00 2001
From: Cheez22
Date: Fri, 18 Oct 2024 17:06:20 -0400
Subject: [PATCH 1/4] PromptClip with Gemini, output modified

This version of the code uses Google's Gemini. The output has been modified
to improve video editing efficiency, turning PromptClip into more of a search
engine for videos.
---
 PromptClip_with_Gemini.ipynb | 641 +++++++++++++++++++++++++++++++++++
 1 file changed, 641 insertions(+)
 create mode 100644 PromptClip_with_Gemini.ipynb

diff --git a/PromptClip_with_Gemini.ipynb b/PromptClip_with_Gemini.ipynb
new file mode 100644
index 0000000..4a19735
--- /dev/null
+++ b/PromptClip_with_Gemini.ipynb
@@ -0,0 +1,641 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "zJ9lTpS1cWze"
+ },
+ "outputs": [],
+ "source": [
+    "!pip install -r requirements.txt\n",
+    "!pip install -U videodb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "eoqzfNylPAsI"
+ },
+ "outputs": [],
+ "source": [
+    "# PromptClip_multimodal\n",
+    "\n",
+    "import os\n",
+    "import videodb\n",
+    "import sys\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Connect to VideoDB\n",
+    "conn = videodb.connect(api_key=\"\") # API KEY FOR VIDEODB\n",
+    "coll = conn.get_collection()\n",
+    "\n",
+    "video_id = \"\" # VIDEO ID\n",
+    "video_url = \"\" # VIDEO URL\n",
+    "\n",
+    "if not video_id:\n",
+    "    video = coll.upload(url=video_url)\n",
+    "else:\n",
+    "    video = coll.get_video(video_id)\n",
+    "print(f\"VIDEO ID: {video.id}, VIDEO NAME: {video.name}\")\n",
+    "video.play()\n",
+    "\n",
+    "scene_index_id = \"\" # Insert scene index ID, or leave blank if not indexed.\n",
+    "\n",
+    "if scene_index_id:\n",
+    "    # If scene_index_id is provided, retrieve the scenes using it.\n",
+    "    try:\n",
+    "        print(f\"Using scene index ID: {scene_index_id}\")\n",
+    "        # Assuming video.get_scene_index() is the correct method to use for this purpose:\n",
+    "        scenes = video.get_scene_index(scene_index_id)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error retrieving scenes from scene index: {e}\")\n",
+    "else:\n",
+    "    # If scene_index_id is not available, process transcript from spoken words.\n",
+    "    print(\"Scene index ID not found, processing transcript from spoken words...\")\n",
+    "    try:\n",
+    "        transcript = video.get_transcript()\n",
+    "        transcript_text = video.get_transcript_text()\n",
+    "    except Exception as e:\n",
+    "        print(f\"No transcript found, indexing spoken words... 
Error: {e}\")\n", + " video.index_spoken_words()\n", + " transcript = video.get_transcript()\n", + " transcript_text = video.get_transcript_text()\n", + "\n", + "# Index scenes if scene_index_id is not available\n", + "if not scene_index_id:\n", + " scene_index_id = video.index_scenes(\n", + " prompt=\"Summarize the essence of the scene in one or two concise sentences\" # Provides description\n", + " )\n", + " # Retrieve scenes using the new index.\n", + " scenes = video.get_scene_index(scene_index_id)\n", + "print(f\"Scene Count: {len(scenes)}\\n\")\n", + "\n", + "print(\"TIMESTAMPS:\")\n", + "for scene in scenes:\n", + " print(f\"{scene['start']}-{scene['end']}: {scene['description']}\") #make this into loggging maybe?\n", + "\n", + "user_prompt = \"Find the 5 strongest looking animal scenes, they must not be connected orignally\" # INSERT USER PROMPT\n", + "\n", + "result = scene_prompter(scenes, user_prompt, LLM(llm_type=LLMType.GEMINI, model=Models.GEMINI_1_5_FLASH), run_concurrent=False)\n", + "\n", + "from videodb import play_stream\n", + "from videodb.timeline import Timeline\n", + "\n", + "\n", + "timeline = Timeline(conn)\n", + "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", + "reversed_scene_index={}\n", + "\n", + "timeline, duration = build_video_timeline(video, result_timestamps, timeline, max_duration=15) #max_duration must be 10 seconds or longer\n", + "\n", + "stream_url = timeline.generate_stream()\n", + "print(f\"Stream URL: {stream_url}\") #Prints the downloadable stream URL, m3u8 format\n", + "play_stream(stream_url)\n", + "\n", + "#IMPORANT: Get timestamps in a good way to see it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OPXNIhMrsL_h" + }, + "outputs": [], + "source": [ + "#llm_agent.py\n", + "\n", + "import json\n", + "import os\n", + "\n", + "import requests\n", + "from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT\n", + "import google.generativeai as genai\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "OPENAI_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "CLAUDE_KEY = os.getenv(\"ANTHROPIC_KEY\")\n", + "GEMINI_KEY = \"\" # API KEY FOR GEMINI\n", + "\n", + "\n", + "class LLMType:\n", + " OPENAI = \"openAI\"\n", + " CLAUDE = \"claude\"\n", + " GEMINI = \"gemini\"\n", + "\n", + "\n", + "class Models:\n", + " GPT3 = \"gpt-3.5-turbo-16k\"\n", + " GPT4 = \"gpt-4\"\n", + " GPT4o = \"gpt-4o\"\n", + " GPT4o_new = \"gpt-4o-2024-08-06\"\n", + " CLAUDE_INSTANT = \"claude-instant-1.1\"\n", + " CLAUDE2 = \"claude-2\"\n", + " GEMINI_1_5_FLASH = \"gemini-1.5-flash\"\n", + " GEMINI_1_5_PRO = \"gemini-1.5-pro\"\n", + " OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT = [GPT4o, GPT4o_new]\n", + "\n", + "\n", + "class LLM:\n", + " def __init__(self, llm_type=LLMType.OPENAI, model=Models.GPT4):\n", + " self.type = llm_type\n", + " self.model = model\n", + " self.openai_key = os.getenv(\"OPENAI_API_KEY\")\n", + " self.claude_key = os.getenv(\"ANTHROPIC_KEY\")\n", + " self.gemini_key = \"\" # API KEY FOR GEMINI\n", + "\n", + " def chat(self, message, functions=None):\n", + " if self.type == LLMType.OPENAI:\n", + " message = [self._to_gpt_msg(message)]\n", + " return self._call_openai(message, functions)\n", + " elif self.type == LLMType.CLAUDE:\n", + " return self._call_claude(message)\n", + " elif self.type == LLMType.GEMINI:\n", + " return self._call_gemini(message)\n", + " else:\n", + " raise ValueError(\"Unsupported LLM type.\")\n", + "\n", + " def _to_gpt_msg(self, data):\n", 
+ " \"\"\"\n", + " convert data to message for LLM\n", + " :param data:\n", + " :return:\n", + " \"\"\"\n", + " context_msg = \"\"\n", + " context_msg += str(data)\n", + "\n", + " return {\"role\": \"system\", \"content\": context_msg}\n", + "\n", + " def _call_openai(self, message, functions=None):\n", + " url = \"https://api.openai.com/v1/chat/completions\"\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"Authorization\": f\"Bearer {self.openai_key}\",\n", + " }\n", + " data = {\n", + " \"model\": self.model,\n", + " \"messages\": message,\n", + " \"temperature\": 0.6,\n", + " }\n", + " if self.model in Models.OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT:\n", + " data[\"response_format\"] = {\"type\": \"json_object\"}\n", + " if functions:\n", + " data.update(\n", + " {\n", + " \"functions\": functions,\n", + " \"function_call\": \"auto\",\n", + " }\n", + " )\n", + "\n", + " response = requests.post(url, headers=headers, data=json.dumps(data))\n", + " try:\n", + " return response.json()\n", + " except json.JSONDecodeError:\n", + " return {\"error\": \"Failed to decode JSON response.\"}\n", + "\n", + " def _call_claude(self, message):\n", + " anthropic = Anthropic(api_key=self.claude_key)\n", + " prompt = f\"{HUMAN_PROMPT} {message} {AI_PROMPT}\"\n", + " try:\n", + " completion = anthropic.completions.create(\n", + " model=self.model,\n", + " max_tokens_to_sample=80000,\n", + " prompt=prompt,\n", + " )\n", + " return {\"response\": completion.completion}\n", + " except (\n", + " Exception\n", + " ) as e:\n", + " return {\"error\": str(e)}\n", + "\n", + " def _call_gemini(self, message):\n", + " genai.configure(api_key=GEMINI_KEY)\n", + " model = genai.GenerativeModel(self.model)\n", + " try:\n", + " response = model.generate_content(message)\n", + " response_text = response.text.replace(\"```json\", \"\").replace(\"```\", \"\")\n", + " response_json = json.loads(response_text)\n", + " return response_json.get(\"sentences\")\n", + " except Exception as e:\n", + " return {\"error\": str(e)}\n", + "\n", + " def get_word_limit(self):\n", + " if self.type == LLMType.CLAUDE:\n", + " return 10000\n", + " return 2000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "zbkpzAR4sJdr" + }, + "outputs": [], + "source": [ + "#video_prompter.py\n", + "import json\n", + "\n", + "import concurrent.futures\n", + "\n", + "from videodb import connect\n", + "from videodb import SearchType, IndexType\n", + "from videodb.timeline import VideoAsset\n", + "\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "def get_connection():\n", + " \"\"\"\n", + " Get connection and load the env.\n", + " :return:\n", + " \"\"\"\n", + " conn = connect()\n", + " return conn\n", + "\n", + "\n", + "def get_video(id):\n", + " \"\"\"\n", + " Get video object\n", + " :param id:\n", + " :return:\n", + " \"\"\"\n", + " conn = get_connection()\n", + " all_videos = conn.get_collection().get_videos()\n", + " video = next(vid for vid in all_videos if vid.id == id)\n", + " return video\n", + "\n", + "\n", + "def chunk_docs(docs, chunk_size):\n", + " \"\"\"\n", + " chunk docs to fit into context of your LLM\n", + " :param docs:\n", + " :param chunk_size:\n", + " :return:\n", + " \"\"\"\n", + " for i in range(0, len(docs), chunk_size):\n", + " yield docs[i : i + chunk_size]\n", + "\n", + "\n", + "def get_result_timestamps(\n", + " video,\n", + " result,\n", + " index_type=\"scene\",\n", + " scene_index_id=None,\n", + 
" sort=\"time\",\n", + " run_concurrent=True,\n", + "):\n", + " \"\"\"\n", + " This function takes the result from scene_prompter and performs a keyword search on the video.\n", + " By default, the function sorts the results by time.\n", + " It returns a list of (start, end, description) for the matched segments.\n", + " \"\"\"\n", + " result_timestamps = []\n", + "\n", + " def search_description(description):\n", + " if index_type == \"scene\":\n", + " search_res = video.search(\n", + " description,\n", + " index_type=IndexType.scene,\n", + " search_type=SearchType.keyword,\n", + " scene_index_id=scene_index_id,\n", + " )\n", + " else:\n", + " search_res = video.search(\n", + " description,\n", + " index_type=IndexType.spoken_word,\n", + " search_type=SearchType.keyword,\n", + " )\n", + " matched_segments = search_res.get_shots()\n", + " if len(matched_segments) == 0:\n", + " return None\n", + " video_shot = matched_segments[0]\n", + " return (video_shot.start, video_shot.end, video_shot.text)\n", + "\n", + " if run_concurrent:\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " future_to_desc = {\n", + " executor.submit(search_description, desc): desc for desc in result\n", + " }\n", + " for future in concurrent.futures.as_completed(future_to_desc):\n", + " res = future.result()\n", + " if res:\n", + " result_timestamps.append(res)\n", + " else:\n", + " for description in result:\n", + " res = search_description(description)\n", + " if res:\n", + " result_timestamps.append(res)\n", + "\n", + " if sort == \"time\":\n", + " result_timestamps.sort(key=lambda x: x[0])\n", + "\n", + " #Print the found segments\n", + " print(f\"\\nFOUND SEGMENTS ({len(result_timestamps)}):\")\n", + " for start, end, description in result_timestamps:\n", + " print(f\"{start:.3f}-{end:.3f}: {description}\")\n", + "\n", + " return result_timestamps\n", + "\n", + "\n", + "def build_video_timeline(\n", + " video, result_timestamps, timeline, top_n=None, max_duration=None, debug=False\n", + "):\n", + " duration = 0\n", + " added_clips = 0\n", + " added_segments = []\n", + "\n", + " if top_n:\n", + " existing_count = len(result_timestamps)\n", + " result_timestamps = result_timestamps[:top_n]\n", + " if debug:\n", + " print(f\"Picked top {top_n} from {existing_count}\")\n", + "\n", + " for result_timestamp in result_timestamps:\n", + " start = float(result_timestamp[0])\n", + " end = float(result_timestamp[1])\n", + " description = result_timestamp[2]\n", + "\n", + " if debug:\n", + " print(f\"Adding clip {added_clips + 1}: {start} - {end}, Description: {description}\")\n", + "\n", + " if max_duration and duration + (end - start) > max_duration:\n", + " print(\"Max duration reached. 
Stopping further additions.\")\n", + " break\n", + "\n", + " timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))\n", + " duration += end - start\n", + " added_clips += 1\n", + " added_segments.append((start, end, description)) #Collects added segments\n", + "\n", + " #Prints the added segments\n", + " print(f\"\\nSEGMENTS ADDED ({len(added_segments)}):\")\n", + " for start, end, description in added_segments:\n", + " print(f\"{start:.3f}-{end:.3f}: {description}\")\n", + "\n", + " return timeline, duration\n", + "\n", + "\n", + "\n", + "def filter_transcript(transcript, start, end):\n", + " result = []\n", + " for entry in transcript:\n", + " if float(entry[\"end\"]) > start and float(entry[\"start\"]) < end:\n", + " result.append(entry)\n", + " return result\n", + "\n", + "\n", + "def get_multimodal_docs(transcript, scenes, club_on=\"scene\"):\n", + " docs = []\n", + " if club_on == \"scene\":\n", + " for scene in scenes:\n", + " spoken_result = filter_transcript(\n", + " transcript, float(scene[\"start\"]), float(scene[\"end\"])\n", + " )\n", + " spoken_text = \" \".join(\n", + " entry[\"text\"] for entry in spoken_result if entry[\"text\"] != \"-\"\n", + " )\n", + " data = {\n", + " \"visual\": scene[\"description\"],\n", + " \"spoken\": spoken_text,\n", + " \"start\": scene[\"start\"],\n", + " \"end\": scene[\"end\"],\n", + " }\n", + " docs.append(data)\n", + " return docs\n", + "\n", + "\n", + "def send_msg_openai(chunk_prompt, llm=LLM()):\n", + " response = llm.chat(message=chunk_prompt)\n", + " output = json.loads(response[\"choices\"][0][\"message\"][\"content\"])\n", + " sentences = output.get(\"sentences\")\n", + " return sentences\n", + "\n", + "\n", + "def send_msg_claude(chunk_prompt, llm):\n", + " response = llm.chat(message=chunk_prompt)\n", + " return response\n", + "\n", + "\n", + "def send_msg_gemini(chunk_prompt, llm):\n", + " response = llm.chat(message=chunk_prompt)\n", + " return response\n", + "\n", + "\n", + "def text_prompter(transcript_text, prompt, llm=None):\n", + " chunk_size = 10000\n", + " chunks = chunk_docs(transcript_text, chunk_size=chunk_size)\n", + "\n", + " if llm is None:\n", + " llm = LLM()\n", + "\n", + " if llm.type == LLMType.OPENAI:\n", + " llm_caller_fn = send_msg_openai\n", + " elif llm.type == LLMType.GEMINI:\n", + " llm_caller_fn = send_msg_gemini\n", + " else:\n", + " llm_caller_fn = send_msg_claude\n", + "\n", + " matches = []\n", + " prompts = []\n", + " i = 0\n", + " for chunk in chunks:\n", + " chunk_prompt = \"\"\"\n", + " You are a video editor who uses AI. Given a user prompt and transcript of a video analyze the text to identify sentences in the transcript relevant to the user prompt for making clips.\n", + " - **Instructions**:\n", + " - Evaluate the sentences for relevance to the specified user prompt.\n", + " - Make sure that sentences start and end properly and meaningfully complete the discussion or topic. Choose the one with the greatest relevance and longest.\n", + " - We'll use the sentences to make video clips in future, so optimize for great viewing experience for people watching the clip of these.\n", + " - If the matched sentences are not too far, merge them into one sentence.\n", + " - Strictly make each result minimum 20 words long. 
If the match is smaller, adjust the boundries and add more context around the sentences.\n", + "\n", + " - **Output Format**: Return a JSON list of strings named 'sentences' that containes the output sentences, make sure they are exact substrings.\n", + " - **User Prompts**: User prompts may include requests like 'find funny moments' or 'find moments for social media'. Interpret these prompts by\n", + " identifying keywords or themes in the transcript that match the intent of the prompt.\n", + " \"\"\"\n", + "\n", + " # pass the data\n", + " chunk_prompt += f\"\"\"\n", + " Transcript: {chunk}\n", + " User Prompt: {prompt}\n", + " \"\"\"\n", + "\n", + " # Add instructions to always return JSON at the end of processing.\n", + " chunk_prompt += \"\"\"\n", + " Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. \\\n", + " If there is no match return empty list without additional text. Use the following structure for your response:\n", + " {\n", + " \"sentences\": [\n", + " {},\n", + " ...\n", + " ]\n", + " }\n", + " \"\"\"\n", + " prompts.append(chunk_prompt)\n", + " i += 1\n", + "\n", + " # make a parallel call to all chunks with prompts\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " future_to_index = {\n", + " executor.submit(llm_caller_fn, prompt, llm): prompt for prompt in prompts\n", + " }\n", + " for future in concurrent.futures.as_completed(future_to_index):\n", + " try:\n", + " matches.extend(future.result())\n", + " except Exception as e:\n", + " print(f\"Chunk failed to work with LLM {str(e)}\")\n", + " return matches\n", + "\n", + "\n", + "def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True):\n", + " chunk_size = 200\n", + " chunks = chunk_docs(transcript_text, chunk_size=chunk_size)\n", + "\n", + " llm_caller_fn = send_msg_gemini\n", + " if llm is None:\n", + " llm = LLM()\n", + "\n", + " # TODO: llm should have caller function\n", + " # 400 sentence at a time\n", + " if llm.type == LLMType.GEMINI:\n", + " llm_caller_fn = send_msg_gemini\n", + " else:\n", + " # claude for now\n", + " llm_caller_fn = send_msg_claude\n", + "\n", + " matches = []\n", + " prompts = []\n", + " i = 0\n", + "\n", + " for chunk in chunks:\n", + " descriptions = [scene[\"description\"] for scene in chunk]\n", + " chunk_prompt = \"\"\"\n", + " You are a video editor who uses AI. Given a user prompt and AI-generated scene descriptions of a video, analyze the descriptions to identify segments relevant to the user prompt for creating clips.\n", + "\n", + " - **Instructions**:\n", + " - Evaluate the scene descriptions for relevance to the specified user prompt.\n", + " - Choose description with the highest relevance and most comprehensive content.\n", + " - Optimize for engaging viewing experiences, considering visual appeal and narrative coherence.\n", + "\n", + " - User Prompts: Interpret prompts like 'find exciting moments' or 'identify key plot points' by matching keywords or themes in the scene descriptions to the intent of the prompt.\n", + " \"\"\"\n", + "\n", + " chunk_prompt += f\"\"\"\n", + " Descriptions: {json.dumps(descriptions)}\n", + " User Prompt: {prompt}\n", + " \"\"\"\n", + "\n", + " chunk_prompt += \"\"\"\n", + " **Output Format**: Return a JSON list of strings named 'result' that containes the fileds `sentence` Ensure the final output\n", + " strictly adheres to the JSON format specified without including additional text or explanations. 
\\\n", + " If there is no match return empty list without additional text. Use the following structure for your response:\n", + " {\"sentences\": []}\n", + " \"\"\"\n", + " prompts.append(chunk_prompt)\n", + " i += 1\n", + "\n", + " if run_concurrent:\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " future_to_index = {\n", + " executor.submit(llm_caller_fn, prompt, llm): prompt\n", + " for prompt in prompts\n", + " }\n", + " for future in concurrent.futures.as_completed(future_to_index):\n", + " try:\n", + " matches.extend(future.result())\n", + " except Exception as e:\n", + " print(f\"Chunk failed to work with LLM {str(e)}\")\n", + " else:\n", + " for prompt in prompts:\n", + " try:\n", + " res = llm_caller_fn(prompt, llm)\n", + " matches.extend(res)\n", + " except Exception as e:\n", + " print(f\"Chunk failed to work with LLM {str(e)}\")\n", + " return matches\n", + "\n", + "\n", + "def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurrent=True):\n", + " docs = get_multimodal_docs(transcript, scene_index)\n", + " chunk_size = 80\n", + " chunks = chunk_docs(docs, chunk_size=chunk_size)\n", + "\n", + " if llm is None:\n", + " llm = LLM()\n", + "\n", + " if llm.type == LLMType.GEMINI:\n", + " llm_caller_fn = send_msg_gemini\n", + " else:\n", + " llm_caller_fn = send_msg_claude\n", + "\n", + " matches = []\n", + " prompts = []\n", + " i = 0\n", + " for chunk in chunks:\n", + " chunk_prompt = f\"\"\"\n", + " You are given visual and spoken information of the video of each second, and a transcipt of what's being spoken along with timestamp.\n", + " Your task is to evaluate the data for relevance to the specified user prompt.\n", + " Corelate visual and spoken content to find the relevant video segment.\n", + "\n", + " Multimodal Data:\n", + " video: {chunk}\n", + " User Prompt: {prompt}\n", + "\n", + "\n", + " \"\"\"\n", + " chunk_prompt += \"\"\"\n", + " **Output Format**: Return a JSON list of strings named 'result' that containes the fileds `sentence`.\n", + " sentence is from the visual section of the input.\n", + " Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations.\n", + " If there is no match return empty list without additional text. 
Use the following structure for your response:\n", + " {\"sentences\": []}\n", + " \"\"\"\n", + " prompts.append(chunk_prompt)\n", + " i += 1\n", + "\n", + " if run_concurrent:\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " future_to_index = {\n", + " executor.submit(llm_caller_fn, prompt, llm): prompt\n", + " for prompt in prompts\n", + " }\n", + " for future in concurrent.futures.as_completed(future_to_index):\n", + " try:\n", + " matches.extend(future.result())\n", + " except Exception as e:\n", + " print(f\"Chunk failed to work with LLM {str(e)}\")\n", + " else:\n", + " for prompt in prompts:\n", + " try:\n", + " res = llm_caller_fn(prompt)\n", + " matches.extend(res)\n", + " except Exception as e:\n", + " import traceback\n", + "\n", + " print(traceback.print_exc())\n", + " print(f\"Chunk failed to work with LLM {str(e)}\")\n", + " return matches" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 1f57fceda7188e2780cb64cef9827b3f6ec9b0cf Mon Sep 17 00:00:00 2001 From: Cheez22 Date: Sun, 10 Nov 2024 10:48:12 -0500 Subject: [PATCH 2/4] Gemini llm_agent.py Intended use for Gemini --- llm_agent.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_agent.py b/llm_agent.py index 0ce5126..f988e16 100644 --- a/llm_agent.py +++ b/llm_agent.py @@ -13,7 +13,7 @@ load_dotenv() OPENAI_KEY = os.getenv("OPENAI_API_KEY") CLAUDE_KEY = os.getenv("ANTHROPIC_KEY") -GEMINI_KEY = os.getenv("GEMINI_API_KEY") +GEMINI_KEY = "" # API KEY FOR GEMINI class LLMType: @@ -40,7 +40,7 @@ def __init__(self, llm_type=LLMType.OPENAI, model=Models.GPT4): self.model = model self.openai_key = os.getenv("OPENAI_API_KEY") self.claude_key = os.getenv("ANTHROPIC_KEY") - self.gemini_key = os.getenv("GEMINI_KEY") + self.gemini_key = "" # API KEY FOR GEMINI def chat(self, message, functions=None): if self.type == LLMType.OPENAI: @@ -66,7 +66,6 @@ def _to_gpt_msg(self, data): def _call_openai(self, message, functions=None): url = "https://api.openai.com/v1/chat/completions" - # print(f'call openAI with message {message}') headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.openai_key}", @@ -104,7 +103,7 @@ def _call_claude(self, message): return {"response": completion.completion} except ( Exception - ) as e: # Consider a more specific exception based on the Anthropic SDK + ) as e: return {"error": str(e)} def _call_gemini(self, message): From 6bdf2fdad9228675c66b49dae5f6a7e7ee392b06 Mon Sep 17 00:00:00 2001 From: Cheez22 Date: Sun, 10 Nov 2024 10:49:10 -0500 Subject: [PATCH 3/4] Gemini video_prompter.py Intended for Gemini. 
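The changes below switch the default chunk size and route scene_prompter and
multimodal_prompter through Gemini instead of OpenAI. A minimal usage sketch
(not part of the patch) of how the updated helpers are expected to be chained
with a Gemini-backed LLM; it assumes conn, video, scenes and scene_index_id
come from the notebook cells above, and the example prompt is only
illustrative:

    from videodb import play_stream
    from videodb.timeline import Timeline

    llm = LLM(llm_type=LLMType.GEMINI, model=Models.GEMINI_1_5_FLASH)

    # Ask Gemini to pick the scene descriptions that match the user prompt.
    result = scene_prompter(scenes, "Find the strongest looking animal scenes",
                            llm, run_concurrent=False)

    # Keyword-search the video for those descriptions, then cut a capped clip.
    result_timestamps = get_result_timestamps(video, result,
                                              scene_index_id=scene_index_id)
    timeline = Timeline(conn)
    timeline, duration = build_video_timeline(video, result_timestamps,
                                              timeline, max_duration=15)
    play_stream(timeline.generate_stream())

run_concurrent=False keeps the Gemini calls sequential, which can be easier to
debug while iterating on prompts.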
--- video_prompter.py | 74 +++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/video_prompter.py b/video_prompter.py index 412fa5b..e558e74 100644 --- a/video_prompter.py +++ b/video_prompter.py @@ -2,7 +2,6 @@ import concurrent.futures -from llm_agent import LLM, LLMType from videodb import connect from videodb import SearchType, IndexType from videodb.timeline import VideoAsset @@ -42,7 +41,7 @@ def chunk_docs(docs, chunk_size): :return: """ for i in range(0, len(docs), chunk_size): - yield docs[i : i + chunk_size] # Yield the current chunk + yield docs[i : i + chunk_size] def get_result_timestamps( @@ -61,7 +60,6 @@ def get_result_timestamps( result_timestamps = [] def search_description(description): - # keyword search on each result description if index_type == "scene": search_res = video.search( description, @@ -77,8 +75,7 @@ def search_description(description): ) matched_segments = search_res.get_shots() if len(matched_segments) == 0: - return None # No match found - + return None video_shot = matched_segments[0] return (video_shot.start, video_shot.end, video_shot.text) @@ -97,42 +94,56 @@ def search_description(description): if res: result_timestamps.append(res) - # Sorting the results if needed if sort == "time": result_timestamps.sort(key=lambda x: x[0]) + #Print the found segments + print(f"\nFOUND SEGMENTS ({len(result_timestamps)}):") + for start, end, description in result_timestamps: + print(f"{start:.3f}-{end:.3f}: {description}") + return result_timestamps -# Creating and returning timeline of given result timestamps def build_video_timeline( video, result_timestamps, timeline, top_n=None, max_duration=None, debug=False ): - """ - This function takes the matched segments list (result_timestamps) and creates a VideoDB Timeline based on the given conditions. - The user can specify top_n to select the top n results. - Additionally, the user can set max_duration to stop adding results to the Timeline if the total duration exceeds this limit. - """ duration = 0 + added_clips = 0 + added_segments = [] + if top_n: existing_count = len(result_timestamps) result_timestamps = result_timestamps[:top_n] if debug: print(f"Picked top {top_n} from {existing_count}") + for result_timestamp in result_timestamps: start = float(result_timestamp[0]) end = float(result_timestamp[1]) description = result_timestamp[2] + if debug: - print(start, end, description) - duration += end - start - if max_duration and duration > max_duration: - duration -= end - start + print(f"Adding clip {added_clips + 1}: {start} - {end}, Description: {description}") + + if max_duration and duration + (end - start) > max_duration: + print("Max duration reached. 
Stopping further additions.") break + timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end)) + duration += end - start + added_clips += 1 + added_segments.append((start, end, description)) #Collects added segments + + #Prints the added segments + print(f"\nSEGMENTS ADDED ({len(added_segments)}):") + for start, end, description in added_segments: + print(f"{start:.3f}-{end:.3f}: {description}") + return timeline, duration + def filter_transcript(transcript, start, end): result = [] for entry in transcript: @@ -142,7 +153,6 @@ def filter_transcript(transcript, start, end): def get_multimodal_docs(transcript, scenes, club_on="scene"): - # TODO: Implement club on transcript docs = [] if club_on == "scene": for scene in scenes: @@ -171,32 +181,26 @@ def send_msg_openai(chunk_prompt, llm=LLM()): def send_msg_claude(chunk_prompt, llm): response = llm.chat(message=chunk_prompt) - # TODO : add claude reposnse parser return response def send_msg_gemini(chunk_prompt, llm): response = llm.chat(message=chunk_prompt) - # TODO : add claude reposnse parser return response def text_prompter(transcript_text, prompt, llm=None): chunk_size = 10000 - # sentence tokenizer chunks = chunk_docs(transcript_text, chunk_size=chunk_size) - # print(f"Length of the sentence chunk are {len(chunks)}") if llm is None: llm = LLM() - # 400 sentence at a time if llm.type == LLMType.OPENAI: llm_caller_fn = send_msg_openai elif llm.type == LLMType.GEMINI: llm_caller_fn = send_msg_gemini else: - # claude for now llm_caller_fn = send_msg_claude matches = [] @@ -204,8 +208,8 @@ def text_prompter(transcript_text, prompt, llm=None): i = 0 for chunk in chunks: chunk_prompt = """ - You are a video editor who uses AI. Given a user prompt and transcript of a video analyze the text to identify sentences in the transcript relevant to the user prompt for making clips. - - **Instructions**: + You are a video editor who uses AI. Given a user prompt and transcript of a video analyze the text to identify sentences in the transcript relevant to the user prompt for making clips. + - **Instructions**: - Evaluate the sentences for relevance to the specified user prompt. - Make sure that sentences start and end properly and meaningfully complete the discussion or topic. Choose the one with the greatest relevance and longest. - We'll use the sentences to make video clips in future, so optimize for great viewing experience for people watching the clip of these. @@ -213,7 +217,7 @@ def text_prompter(transcript_text, prompt, llm=None): - Strictly make each result minimum 20 words long. If the match is smaller, adjust the boundries and add more context around the sentences. - **Output Format**: Return a JSON list of strings named 'sentences' that containes the output sentences, make sure they are exact substrings. - - **User Prompts**: User prompts may include requests like 'find funny moments' or 'find moments for social media'. Interpret these prompts by + - **User Prompts**: User prompts may include requests like 'find funny moments' or 'find moments for social media'. Interpret these prompts by identifying keywords or themes in the transcript that match the intent of the prompt. 
""" @@ -251,17 +255,17 @@ def text_prompter(transcript_text, prompt, llm=None): def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): - chunk_size = 100 + chunk_size = 200 chunks = chunk_docs(transcript_text, chunk_size=chunk_size) - llm_caller_fn = send_msg_openai + llm_caller_fn = send_msg_gemini if llm is None: llm = LLM() # TODO: llm should have caller function # 400 sentence at a time - if llm.type == LLMType.OPENAI: - llm_caller_fn = send_msg_openai + if llm.type == LLMType.GEMINI: + llm_caller_fn = send_msg_gemini else: # claude for now llm_caller_fn = send_msg_claude @@ -275,7 +279,7 @@ def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): chunk_prompt = """ You are a video editor who uses AI. Given a user prompt and AI-generated scene descriptions of a video, analyze the descriptions to identify segments relevant to the user prompt for creating clips. - - **Instructions**: + - **Instructions**: - Evaluate the scene descriptions for relevance to the specified user prompt. - Choose description with the highest relevance and most comprehensive content. - Optimize for engaging viewing experiences, considering visual appeal and narrative coherence. @@ -311,7 +315,7 @@ def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): else: for prompt in prompts: try: - res = llm_caller_fn(prompt) + res = llm_caller_fn(prompt, llm) matches.extend(res) except Exception as e: print(f"Chunk failed to work with LLM {str(e)}") @@ -326,8 +330,8 @@ def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurren if llm is None: llm = LLM() - if llm.type == LLMType.OPENAI: - llm_caller_fn = send_msg_openai + if llm.type == LLMType.GEMINI: + llm_caller_fn = send_msg_gemini else: llm_caller_fn = send_msg_claude @@ -344,7 +348,7 @@ def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurren video: {chunk} User Prompt: {prompt} - + """ chunk_prompt += """ **Output Format**: Return a JSON list of strings named 'result' that containes the fileds `sentence`. From 5e2f528628ec04dc24da3ae23e3f04300b6992cd Mon Sep 17 00:00:00 2001 From: Cheez22 Date: Sat, 16 Nov 2024 14:55:54 -0500 Subject: [PATCH 4/4] ModifyingV1 PromptClip_multimodal.ipynb Not finished yet. 
---
 PromptClip_multimodal.ipynb | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/PromptClip_multimodal.ipynb b/PromptClip_multimodal.ipynb
index bb599df..5a33242 100644
--- a/PromptClip_multimodal.ipynb
+++ b/PromptClip_multimodal.ipynb
@@ -53,11 +53,11 @@
     "load_dotenv()\n",
     "\n",
     "# Connect to VideoDB\n",
-    "conn = videodb.connect()\n",
+    "conn = videodb.connect(api_key=\"\") # API KEY FOR VIDEODB\n",
     "coll = conn.get_collection()\n",
     "\n",
     "# TODO: Add video_id if video already exists in the collection\n",
-    "video_id = os.getenv(\"MULTIMODAL_DEMO_VIDEO_ID\")\n",
+    "video_id = \"\" # MULTIMODAL_DEMO_VIDEO_ID\n",
     "video_url = \"https://www.youtube.com/watch?v=NZGLHdcw2RM\"\n",
     "\n",
     "if not video_id:\n",
@@ -65,7 +65,7 @@
     "else:\n",
     "    video = coll.get_video(video_id)\n",
     "\n",
-    "print(f\"video_id: {video.id}, name: {video.name}\")\n",
+    "print(f\"VIDEO ID: {video.id}, VIDEO NAME: {video.name}\")\n",
     "video.play()"
   ]
 },
@@ -116,14 +116,21 @@
    "outputs": [],
    "source": [
     "# Add scene_index_id here if already indexed.\n",
-    "scene_index_id = os.getenv(\"MULTIMODAL_DEMO_SCENE_INDEX_ID\")\n",
-    "\n",
+    "scene_index_id = \"\"\n",
+    "if scene_index_id:\n",
+    "    # If scene_index_id is provided, retrieve the scenes using it.\n",
+    "    try:\n",
+    "        print(f\"Using scene index ID: {scene_index_id}\")\n",
+    "        scenes = video.get_scene_index(scene_index_id)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error retrieving scenes from scene index: {e}\")\n",
+    "if not scene_index_id:\n",
     "    scene_index_id = video.index_scenes(\n",
     "        prompt=\"Summarize the essence of the scene in one or two concise sentences.\"\n",
     "    )\n",
     "scenes = video.get_scene_index(scene_index_id)\n",
-    "print(f\"Video is indexed with scene_index_id {scene_index_id} and has {len(scenes)} scenes.\")"
+    "print(f\"Scene Count: {len(scenes)}\\n\")"
   ]
 },
 {