
Commit d537e3b

1 parent 45ef729 commit d537e3b

File tree

12 files changed: +442 / -239 lines


frontends/api/src/generated/v1/api.ts

Lines changed: 33 additions & 102 deletions
Some generated files are not rendered by default.

frontends/main/package.json

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,7 @@
     "@ebay/nice-modal-react": "^1.2.13",
     "@emotion/cache": "^11.13.1",
     "@emotion/styled": "^11.11.0",
-    "@mitodl/course-search-utils": "git+https://github.com/mitodl/course-search-utils.git#ab/add-hybrid-search-flag",
+    "@mitodl/course-search-utils": "^3.4.1",
     "@mitodl/mitxonline-api-axios": "^2025.10.21",
     "@mitodl/smoot-design": "^6.17.1",
     "@next/bundle-analyzer": "^14.2.15",
@@ -37,6 +37,7 @@
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "react-slick": "^0.30.2",
+    "sharp": "0.34.4",
     "slick-carousel": "^1.8.1",
     "tiny-invariant": "^1.3.3",
     "yup": "^1.4.0"

frontends/main/src/app-pages/SearchPage/SearchPage.tsx

Lines changed: 0 additions & 3 deletions
@@ -115,9 +115,6 @@ const SearchPage: React.FC = () => {
     onFacetsChange,
   })
 
-  console.log(params)
-  console.log(searchParams)
-
   const page = +(searchParams.get("page") ?? "1")
 
   useEffect(() => {

frontends/main/src/page-components/SearchDisplay/SearchDisplay.tsx

Lines changed: 0 additions & 1 deletion
@@ -639,7 +639,6 @@ const SearchDisplay: React.FC<SearchDisplayProps> = ({
   min_score: number
   max_incompleteness_penalty: number
   content_file_score_weight: number
-  use_hybrid_search: string
 }
 
 const AdminOptions = (

frontends/main/src/page-components/SearchDisplay/getSearchParams.ts

Lines changed: 0 additions & 2 deletions
@@ -8,7 +8,6 @@ import type {
   Facets,
   BooleanFacets,
 } from "@mitodl/course-search-utils"
-import { use } from "react"
 
 export const PAGE_SIZE = 20
 
@@ -40,7 +39,6 @@ const getSearchParams = ({
     max_incompleteness_penalty: searchParams.get("max_incompleteness_penalty"),
     content_file_score_weight: searchParams.get("content_file_score_weight"),
     resource_category: resourceCategory ? [resourceCategory] : null,
-    use_hybrid_search: searchParams.get("use_hybrid_search"),
     aggregations: [...(facetNames || []), "resource_category"],
     ...requestParams,
     offset: (Number(page) - 1) * pageSize,

learning_resources_search/api.py

Lines changed: 40 additions & 19 deletions
@@ -13,13 +13,15 @@
 from learning_resources.models import LearningResource
 from learning_resources_search.connection import (
     get_default_alias_name,
+    get_vector_model_id,
 )
 from learning_resources_search.constants import (
     COMBINED_INDEX,
     CONTENT_FILE_TYPE,
     COURSE_QUERY_FIELDS,
     COURSE_TYPE,
     DEPARTMENT_QUERY_FIELDS,
+    HYBRID_SEARCH_MODE,
     LEARNING_RESOURCE,
     LEARNING_RESOURCE_QUERY_FIELDS,
     LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS,
@@ -63,7 +65,7 @@ def gen_content_file_id(content_file_id):
     return f"cf_{content_file_id}"
 
 
-def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search=False):
+def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search):
     """
     Return list of relevent index type for the query
 
@@ -143,7 +145,11 @@ def generate_sort_clause(search_params):
     return sort
 
 
-def wrap_text_clause(text_query, min_score=None, use_hybrid_search=False):
+def wrap_text_clause(
+    text_query,
+    use_hybrid_search,
+    min_score=None,
+):
     """
     Wrap the text subqueries in a bool query
     Shared by generate_content_file_text_clause and
@@ -209,11 +215,11 @@ def generate_content_file_text_clause(text):
     else:
         text_query = {}
 
-    return wrap_text_clause(text_query)
+    return wrap_text_clause(text_query, use_hybrid_search=False)
 
 
 def generate_learning_resources_text_clause(
-    text, search_mode, slop, content_file_score_weight, min_score, use_hybrid_search
+    text, search_mode, slop, content_file_score_weight, min_score
 ):
     """
     Return text clause for the query
@@ -224,16 +230,23 @@
         dict: dictionary with the opensearch text clause
     """
 
+    use_hybrid_search = search_mode == HYBRID_SEARCH_MODE
+
     query_type = (
         "query_string" if text.startswith('"') and text.endswith('"') else "multi_match"
     )
 
     extra_params = {}
 
-    if query_type == "multi_match" and search_mode:
-        extra_params["type"] = search_mode
+    if use_hybrid_search:
+        text_search_mode = settings.DEFAULT_SEARCH_MODE
+    else:
+        text_search_mode = search_mode
+
+    if query_type == "multi_match":
+        extra_params["type"] = text_search_mode
 
-    if search_mode == "phrase" and slop:
+    if text_search_mode == "phrase" and slop:
         extra_params["slop"] = slop
 
     if content_file_score_weight is not None:
@@ -337,7 +350,7 @@
     else:
         text_query = {}
 
-    return wrap_text_clause(text_query, min_score, use_hybrid_search)
+    return wrap_text_clause(text_query, use_hybrid_search, min_score)
 
 
 def generate_filter_clause(
@@ -587,7 +600,6 @@ def add_text_query_to_search(
         search_params.get("slop"),
         search_params.get("content_file_score_weight"),
         search_params.get("min_score"),
-        use_hybrid_search,
     )
 
     yearly_decay_percent = search_params.get("yearly_decay_percent")
@@ -637,11 +649,17 @@
         text_query = {"bool": {"must": [text_query], "filter": query_type_query}}
 
     if use_hybrid_search:
+        vector_model_id = get_vector_model_id()
+        if not vector_model_id:
+            log.error("Vector model not found. Cannot perform hybrid search.")
+            error_message = "Vector model not found."
+            raise ValueError(error_message)
+
         vector_query_description = {
             "neural": {
                 "description_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -651,7 +669,7 @@
             "neural": {
                 "title_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -675,7 +693,7 @@
     return search
 
 
-def construct_search(search_params):
+def construct_search(search_params):  # noqa: C901
     """
     Construct a learning resources search based on the query
 
@@ -694,7 +712,7 @@ def construct_search(search_params):
     ):
         search_params["resource_type"] = list(LEARNING_RESOURCE_TYPES)
 
-    use_hybrid_search = search_params.get("use_hybrid_search", False)
+    use_hybrid_search = search_params.get("search_mode") == HYBRID_SEARCH_MODE
 
     indexes = relevant_indexes(
         search_params.get("resource_type"),
@@ -706,7 +724,7 @@
     search = Search(index=",".join(indexes))
 
     search = search.source(fields={"excludes": SOURCE_EXCLUDED_FIELDS})
-    if not search_params.get("use_hybrid_search"):
+    if not use_hybrid_search:
         search = search.params(search_type="dfs_query_then_fetch")
     if search_params.get("offset"):
         search = search.extra(from_=search_params.get("offset"))
@@ -763,12 +781,12 @@ def execute_learn_search(search_params):
     Returns:
         dict: The opensearch response dict
     """
-    print(search_params)
     if search_params.get("endpoint") != CONTENT_FILE_TYPE:
         if search_params.get("yearly_decay_percent") is None:
             search_params["yearly_decay_percent"] = (
                 settings.DEFAULT_SEARCH_STALENESS_PENALTY
             )
+
         if search_params.get("search_mode") is None:
             search_params["search_mode"] = settings.DEFAULT_SEARCH_MODE
         if search_params.get("slop") is None:
@@ -781,7 +799,7 @@
     )
     search = construct_search(search_params)
 
-    if search_params.get("use_hybrid_search"):
+    if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
         search = search.extra(
             search_pipeline={
                 "description": "Post processor for hybrid search",
@@ -799,7 +817,6 @@
             }
         )
 
-    print(search.to_dict())
     return search.execute().to_dict()
 
 
@@ -915,7 +932,9 @@ def get_similar_topics(
         list of str:
             list of topic values
     """
-    indexes = relevant_indexes([COURSE_TYPE], [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        [COURSE_TYPE], [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     search = search.filter("term", resource_type=COURSE_TYPE)
     search = search.query(
@@ -1053,7 +1072,9 @@ def get_similar_resources_opensearch(
         list of str:
             list of learning resources
     """
-    indexes = relevant_indexes(LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     if num_resources:
         # adding +1 to num_resources since we filter out existing resource.id
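
Note: hybrid search is now selected with search_mode == HYBRID_SEARCH_MODE ("hybrid") instead of a separate use_hybrid_search flag, and the embedding model id is resolved at query time via get_vector_model_id(). A minimal Python sketch of the request body this roughly produces is below; it assumes wrap_text_clause nests the lexical clause and the two neural clauses under OpenSearch's hybrid query (that wrapping is not visible in these hunks), and the query text and field list are illustrative only.

# Approximate shape of a hybrid search request (sketch, not the exact output).
# Sub-query scores are normalized and combined by the per-request
# search_pipeline added in execute_learn_search.
example_body = {
    "query": {
        "hybrid": {
            "queries": [
                # lexical clause (multi_match / query_string, as built above)
                {
                    "multi_match": {
                        "query": "linear algebra",
                        "fields": ["title", "description"],
                    }
                },
                # neural clauses added when use_hybrid_search is True
                {
                    "neural": {
                        "description_embedding": {
                            "query_text": "linear algebra",
                            "model_id": "<vector_model_id>",
                            "min_score": 0.015,
                        }
                    }
                },
                {
                    "neural": {
                        "title_embedding": {
                            "query_text": "linear algebra",
                            "model_id": "<vector_model_id>",
                            "min_score": 0.015,
                        }
                    }
                },
            ]
        }
    }
}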

learning_resources_search/connection.py

Lines changed: 14 additions & 0 deletions
@@ -135,3 +135,17 @@ def refresh_index(index):
     """
     conn = get_conn()
     conn.indices.refresh(index)
+
+
+def get_vector_model_id():
+    conn = get_conn()
+    model_name = "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b"
+    body = {"query": {"term": {"name.keyword": model_name}}}
+    models = conn.transport.perform_request(
+        "GET", "/_plugins/_ml/models/_search", body=body
+    )
+
+    if len(models.get("hits", {}).get("hits", [])) > 0:
+        return models["hits"]["hits"][0]["_source"]["model_id"]
+
+    return None
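
Note: get_vector_model_id() looks up the registered embedding model by name through the ML Commons model-search endpoint and returns its model_id, or None when nothing is registered. A short usage sketch, mirroring how the other files in this commit call it (the surrounding caller is hypothetical):

from learning_resources_search.connection import get_vector_model_id

# Resolve the deployed embedding model before building a neural clause;
# callers in this commit treat a missing model as a hard error.
model_id = get_vector_model_id()
if model_id is None:
    raise ValueError("Vector model not found.")

neural_clause = {
    "neural": {
        "title_embedding": {
            "query_text": "machine learning",  # illustrative query text
            "model_id": model_id,
            "min_score": 0.015,
        }
    }
}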

learning_resources_search/constants.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
 COMBINED_INDEX = "combined_hybrid"
 
 LEARNING_RESOURCE = "learning_resource"
+HYBRID_SEARCH_MODE = "hybrid"
 
 
 class IndexestoUpdate(Enum):
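
Note: HYBRID_SEARCH_MODE joins the existing search_mode values (the same parameter that already selects multi_match types such as "phrase"), so callers opt into hybrid search through search_mode rather than a separate flag. A hypothetical request sketch; the real parameter keys come from the request serializer, which this commit does not touch:

from learning_resources_search.api import execute_learn_search
from learning_resources_search.constants import HYBRID_SEARCH_MODE

# Hypothetical params dict for illustration only.
search_params = {
    "q": "quantum computing",           # query-text key assumed, not shown in this diff
    "endpoint": "learning_resource",
    "search_mode": HYBRID_SEARCH_MODE,  # "hybrid"
}
results = execute_learn_search(search_params)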

learning_resources_search/indexing_api.py

Lines changed: 11 additions & 32 deletions
@@ -8,6 +8,7 @@
 
 from django.conf import settings
 from django.contrib.auth import get_user_model
+from opensearch_py_ml.ml_commons import MLCommonClient
 from opensearchpy.exceptions import ConflictError, NotFoundError
 from opensearchpy.helpers import BulkIndexError, bulk
 
@@ -17,6 +18,7 @@
     get_conn,
     get_default_alias_name,
     get_reindexing_alias_name,
+    get_vector_model_id,
     make_backing_index_name,
     refresh_index,
 )
@@ -43,7 +45,6 @@
 
 log = logging.getLogger(__name__)
 User = get_user_model()
-from opensearch_py_ml.ml_commons import MLCommonClient
 
 
 def clear_featured_rank(rank, clear_all_greater_than):
@@ -636,7 +637,7 @@ def get_existing_reindexing_indexes(obj_types):
     return reindexing_indexes
 
 
-def update_index_settings():
+def update_local_index_settings_for_hybrid_search():
     settings_body = {
         "persistent": {
             "plugins": {
@@ -654,32 +655,32 @@
 
 def get_ml_client():
     conn = get_conn()
-    ml_client = MLCommonClient(conn)
-    return ml_client
+    return MLCommonClient(conn)
 
 
 def register_model():
     ml_client = get_ml_client()
-    model_id = ml_client.register_pretrained_model(
+    ml_client.register_pretrained_model(
         model_name="huggingface/sentence-transformers/msmarco-distilbert-base-tas-b",
         model_version="1.0.3",
        model_format="TORCH_SCRIPT",
         deploy_model=True,
     )
 
 
-# In [11]: model_id
-# Out[49]: 'PQBFF5oBDk6_T5cL_Izk'
-
-
 def create_ingest_pipeline():
     conn = get_conn()
+    model_id = get_vector_model_id()
+    if not model_id:
+        log.error("Model not found. Cannot create ingest pipeline.")
+        return
+
     pipeline = {
         "description": "An NLP ingest pipeline",
         "processors": [
             {
                 "text_embedding": {
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": model_id,
                     "field_map": {
                         "description": "description_embedding",
                         "title": "title_embedding",
@@ -690,25 +691,3 @@
     }
 
     conn.ingest.put_pipeline("vector_ingest_pipeline", pipeline)
-
-
-def create_search_pipeline():
-    conn = get_conn()
-    pipeline = {
-        "description": "Post processor for hybrid search",
-        "phase_results_processors": [
-            {
-                "normalization-processor": {
-                    "normalization": {"technique": "min_max"},
-                    "combination": {
-                        "technique": "arithmetic_mean",
-                        "parameters": {"weights": [0.7, 0.3]},
-                    },
-                }
-            }
-        ],
-    }
-
-    conn.transport.perform_request(
-        "PUT", "/_search/pipeline/hybrid_search_pipeline", body=pipeline
-    )
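
Note: the hard-coded model id and the stored search pipeline are gone. The ingest pipeline now resolves the model id via get_vector_model_id(), and the hybrid post-processing (min_max normalization, 0.7/0.3 arithmetic-mean weights) is passed inline per request in execute_learn_search rather than saved as a named /_search/pipeline, which is why create_search_pipeline was removed. A sketch of the expected one-time local setup order, based only on the functions in this file (the command or task that actually invokes them is not part of this commit):

from learning_resources_search.indexing_api import (
    create_ingest_pipeline,
    register_model,
    update_local_index_settings_for_hybrid_search,
)

# One-time local setup sketch for hybrid search (invocation path assumed).
update_local_index_settings_for_hybrid_search()  # cluster settings for the ML plugin
register_model()          # register + deploy msmarco-distilbert-base-tas-b
create_ingest_pipeline()  # looks up the model id and maps title/description
                          # to title_embedding/description_embedding at index time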
