
Commit d537e3b

1 parent 45ef729 commit d537e3b

File tree

12 files changed: +442 / -239 lines


frontends/api/src/generated/v1/api.ts

Lines changed: 33 additions & 102 deletions
Some generated files are not rendered by default.

frontends/main/package.json

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,7 @@
     "@ebay/nice-modal-react": "^1.2.13",
     "@emotion/cache": "^11.13.1",
     "@emotion/styled": "^11.11.0",
-    "@mitodl/course-search-utils": "git+https://github.com/mitodl/course-search-utils.git#ab/add-hybrid-search-flag",
+    "@mitodl/course-search-utils": "^3.4.1",
     "@mitodl/mitxonline-api-axios": "^2025.10.21",
     "@mitodl/smoot-design": "^6.17.1",
     "@next/bundle-analyzer": "^14.2.15",
@@ -37,6 +37,7 @@
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "react-slick": "^0.30.2",
+    "sharp": "0.34.4",
     "slick-carousel": "^1.8.1",
     "tiny-invariant": "^1.3.3",
     "yup": "^1.4.0"

frontends/main/src/app-pages/SearchPage/SearchPage.tsx

Lines changed: 0 additions & 3 deletions
@@ -115,9 +115,6 @@ const SearchPage: React.FC = () => {
     onFacetsChange,
   })
 
-  console.log(params)
-  console.log(searchParams)
-
   const page = +(searchParams.get("page") ?? "1")
 
   useEffect(() => {

frontends/main/src/page-components/SearchDisplay/SearchDisplay.tsx

Lines changed: 0 additions & 1 deletion
@@ -639,7 +639,6 @@ const SearchDisplay: React.FC<SearchDisplayProps> = ({
   min_score: number
   max_incompleteness_penalty: number
   content_file_score_weight: number
-  use_hybrid_search: string
 }
 
 const AdminOptions = (

frontends/main/src/page-components/SearchDisplay/getSearchParams.ts

Lines changed: 0 additions & 2 deletions
@@ -8,7 +8,6 @@ import type {
   Facets,
   BooleanFacets,
 } from "@mitodl/course-search-utils"
-import { use } from "react"
 
 export const PAGE_SIZE = 20
 
@@ -40,7 +39,6 @@ const getSearchParams = ({
     max_incompleteness_penalty: searchParams.get("max_incompleteness_penalty"),
     content_file_score_weight: searchParams.get("content_file_score_weight"),
     resource_category: resourceCategory ? [resourceCategory] : null,
-    use_hybrid_search: searchParams.get("use_hybrid_search"),
     aggregations: [...(facetNames || []), "resource_category"],
     ...requestParams,
     offset: (Number(page) - 1) * pageSize,

learning_resources_search/api.py

Lines changed: 40 additions & 19 deletions
@@ -13,13 +13,15 @@
 from learning_resources.models import LearningResource
 from learning_resources_search.connection import (
     get_default_alias_name,
+    get_vector_model_id,
 )
 from learning_resources_search.constants import (
     COMBINED_INDEX,
     CONTENT_FILE_TYPE,
     COURSE_QUERY_FIELDS,
     COURSE_TYPE,
     DEPARTMENT_QUERY_FIELDS,
+    HYBRID_SEARCH_MODE,
     LEARNING_RESOURCE,
     LEARNING_RESOURCE_QUERY_FIELDS,
     LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS,
@@ -63,7 +65,7 @@ def gen_content_file_id(content_file_id):
     return f"cf_{content_file_id}"
 
 
-def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search=False):
+def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search):
     """
     Return list of relevent index type for the query
 
@@ -143,7 +145,11 @@ def generate_sort_clause(search_params):
     return sort
 
 
-def wrap_text_clause(text_query, min_score=None, use_hybrid_search=False):
+def wrap_text_clause(
+    text_query,
+    use_hybrid_search,
+    min_score=None,
+):
     """
     Wrap the text subqueries in a bool query
     Shared by generate_content_file_text_clause and
@@ -209,11 +215,11 @@ def generate_content_file_text_clause(text):
     else:
         text_query = {}
 
-    return wrap_text_clause(text_query)
+    return wrap_text_clause(text_query, use_hybrid_search=False)
 
 
 def generate_learning_resources_text_clause(
-    text, search_mode, slop, content_file_score_weight, min_score, use_hybrid_search
+    text, search_mode, slop, content_file_score_weight, min_score
 ):
     """
     Return text clause for the query
@@ -224,16 +230,23 @@
         dict: dictionary with the opensearch text clause
     """
 
+    use_hybrid_search = search_mode == HYBRID_SEARCH_MODE
+
     query_type = (
         "query_string" if text.startswith('"') and text.endswith('"') else "multi_match"
     )
 
     extra_params = {}
 
-    if query_type == "multi_match" and search_mode:
-        extra_params["type"] = search_mode
+    if use_hybrid_search:
+        text_search_mode = settings.DEFAULT_SEARCH_MODE
+    else:
+        text_search_mode = search_mode
+
+    if query_type == "multi_match":
+        extra_params["type"] = text_search_mode
 
-    if search_mode == "phrase" and slop:
+    if text_search_mode == "phrase" and slop:
         extra_params["slop"] = slop
 
     if content_file_score_weight is not None:
@@ -337,7 +350,7 @@
     else:
         text_query = {}
 
-    return wrap_text_clause(text_query, min_score, use_hybrid_search)
+    return wrap_text_clause(text_query, use_hybrid_search, min_score)
 
 
 def generate_filter_clause(
@@ -587,7 +600,6 @@ def add_text_query_to_search(
         search_params.get("slop"),
         search_params.get("content_file_score_weight"),
         search_params.get("min_score"),
-        use_hybrid_search,
     )
 
     yearly_decay_percent = search_params.get("yearly_decay_percent")
@@ -637,11 +649,17 @@
         text_query = {"bool": {"must": [text_query], "filter": query_type_query}}
 
     if use_hybrid_search:
+        vector_model_id = get_vector_model_id()
+        if not vector_model_id:
+            log.error("Vector model not found. Cannot perform hybrid search.")
+            error_message = "Vector model not found."
+            raise ValueError(error_message)
+
         vector_query_description = {
             "neural": {
                 "description_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -651,7 +669,7 @@
             "neural": {
                 "title_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -675,7 +693,7 @@
     return search
 
 
-def construct_search(search_params):
+def construct_search(search_params):  # noqa: C901
     """
     Construct a learning resources search based on the query
 
@@ -694,7 +712,7 @@ def construct_search(search_params):
     ):
         search_params["resource_type"] = list(LEARNING_RESOURCE_TYPES)
 
-    use_hybrid_search = search_params.get("use_hybrid_search", False)
+    use_hybrid_search = search_params.get("search_mode") == HYBRID_SEARCH_MODE
 
     indexes = relevant_indexes(
         search_params.get("resource_type"),
@@ -706,7 +724,7 @@
     search = Search(index=",".join(indexes))
 
     search = search.source(fields={"excludes": SOURCE_EXCLUDED_FIELDS})
-    if not search_params.get("use_hybrid_search"):
+    if not use_hybrid_search:
         search = search.params(search_type="dfs_query_then_fetch")
     if search_params.get("offset"):
         search = search.extra(from_=search_params.get("offset"))
@@ -763,12 +781,12 @@ def execute_learn_search(search_params):
     Returns:
         dict: The opensearch response dict
     """
-    print(search_params)
     if search_params.get("endpoint") != CONTENT_FILE_TYPE:
         if search_params.get("yearly_decay_percent") is None:
             search_params["yearly_decay_percent"] = (
                 settings.DEFAULT_SEARCH_STALENESS_PENALTY
             )
+
         if search_params.get("search_mode") is None:
             search_params["search_mode"] = settings.DEFAULT_SEARCH_MODE
         if search_params.get("slop") is None:
@@ -781,7 +799,7 @@
     )
     search = construct_search(search_params)
 
-    if search_params.get("use_hybrid_search"):
+    if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
         search = search.extra(
             search_pipeline={
                 "description": "Post processor for hybrid search",
@@ -799,7 +817,6 @@
             }
         )
 
-    print(search.to_dict())
     return search.execute().to_dict()
 
 
@@ -915,7 +932,9 @@ def get_similar_topics(
         list of str:
             list of topic values
     """
-    indexes = relevant_indexes([COURSE_TYPE], [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        [COURSE_TYPE], [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     search = search.filter("term", resource_type=COURSE_TYPE)
     search = search.query(
@@ -1053,7 +1072,9 @@ def get_similar_resources_opensearch(
         list of str:
             list of learning resources
     """
-    indexes = relevant_indexes(LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     if num_resources:
         # adding +1 to num_resources since we filter out existing resource.id
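
Note: hybrid search is now selected with search_mode == HYBRID_SEARCH_MODE ("hybrid") instead of a separate use_hybrid_search flag, and the embedding model id is resolved at query time via get_vector_model_id(). A minimal Python sketch of the request body this roughly produces is below; it assumes wrap_text_clause nests the lexical clause and the two neural clauses under OpenSearch's hybrid query (that wrapping is not visible in these hunks), and the query text and field list are illustrative only.

# Approximate shape of a hybrid search request (sketch, not the exact output).
# Sub-query scores are normalized and combined by the per-request
# search_pipeline added in execute_learn_search.
example_body = {
    "query": {
        "hybrid": {
            "queries": [
                # lexical clause (multi_match / query_string, as built above)
                {
                    "multi_match": {
                        "query": "linear algebra",
                        "fields": ["title", "description"],
                    }
                },
                # neural clauses added when use_hybrid_search is True
                {
                    "neural": {
                        "description_embedding": {
                            "query_text": "linear algebra",
                            "model_id": "<vector_model_id>",
                            "min_score": 0.015,
                        }
                    }
                },
                {
                    "neural": {
                        "title_embedding": {
                            "query_text": "linear algebra",
                            "model_id": "<vector_model_id>",
                            "min_score": 0.015,
                        }
                    }
                },
            ]
        }
    }
}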

learning_resources_search/connection.py

Lines changed: 14 additions & 0 deletions
@@ -135,3 +135,17 @@ def refresh_index(index):
     """
     conn = get_conn()
     conn.indices.refresh(index)
+
+
+def get_vector_model_id():
+    conn = get_conn()
+    model_name = "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b"
+    body = {"query": {"term": {"name.keyword": model_name}}}
+    models = conn.transport.perform_request(
+        "GET", "/_plugins/_ml/models/_search", body=body
+    )
+
+    if len(models.get("hits", {}).get("hits", [])) > 0:
+        return models["hits"]["hits"][0]["_source"]["model_id"]
+
+    return None
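
Note: get_vector_model_id() looks up the registered embedding model by name through the ML Commons model-search endpoint and returns its model_id, or None when nothing is registered. A short usage sketch, mirroring how the other files in this commit call it (the surrounding caller is hypothetical):

from learning_resources_search.connection import get_vector_model_id

# Resolve the deployed embedding model before building a neural clause;
# callers in this commit treat a missing model as a hard error.
model_id = get_vector_model_id()
if model_id is None:
    raise ValueError("Vector model not found.")

neural_clause = {
    "neural": {
        "title_embedding": {
            "query_text": "machine learning",  # illustrative query text
            "model_id": model_id,
            "min_score": 0.015,
        }
    }
}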

learning_resources_search/constants.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
 COMBINED_INDEX = "combined_hybrid"
 
 LEARNING_RESOURCE = "learning_resource"
+HYBRID_SEARCH_MODE = "hybrid"
 
 
 class IndexestoUpdate(Enum):
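
Note: HYBRID_SEARCH_MODE joins the existing search_mode values (the same parameter that already selects multi_match types such as "phrase"), so callers opt into hybrid search through search_mode rather than a separate flag. A hypothetical request sketch; the real parameter keys come from the request serializer, which this commit does not touch:

from learning_resources_search.api import execute_learn_search
from learning_resources_search.constants import HYBRID_SEARCH_MODE

# Hypothetical params dict for illustration only.
search_params = {
    "q": "quantum computing",           # query-text key assumed, not shown in this diff
    "endpoint": "learning_resource",
    "search_mode": HYBRID_SEARCH_MODE,  # "hybrid"
}
results = execute_learn_search(search_params)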

learning_resources_search/indexing_api.py

Lines changed: 11 additions & 32 deletions
@@ -8,6 +8,7 @@
 
 from django.conf import settings
 from django.contrib.auth import get_user_model
+from opensearch_py_ml.ml_commons import MLCommonClient
 from opensearchpy.exceptions import ConflictError, NotFoundError
 from opensearchpy.helpers import BulkIndexError, bulk
 
@@ -17,6 +18,7 @@
     get_conn,
     get_default_alias_name,
     get_reindexing_alias_name,
+    get_vector_model_id,
     make_backing_index_name,
     refresh_index,
 )
@@ -43,7 +45,6 @@
 
 log = logging.getLogger(__name__)
 User = get_user_model()
-from opensearch_py_ml.ml_commons import MLCommonClient
 
 
 def clear_featured_rank(rank, clear_all_greater_than):
@@ -636,7 +637,7 @@ def get_existing_reindexing_indexes(obj_types):
     return reindexing_indexes
 
 
-def update_index_settings():
+def update_local_index_settings_for_hybrid_search():
     settings_body = {
         "persistent": {
             "plugins": {
@@ -654,32 +655,32 @@
 
 def get_ml_client():
     conn = get_conn()
-    ml_client = MLCommonClient(conn)
-    return ml_client
+    return MLCommonClient(conn)
 
 
 def register_model():
     ml_client = get_ml_client()
-    model_id = ml_client.register_pretrained_model(
+    ml_client.register_pretrained_model(
         model_name="huggingface/sentence-transformers/msmarco-distilbert-base-tas-b",
         model_version="1.0.3",
        model_format="TORCH_SCRIPT",
         deploy_model=True,
     )
 
 
-# In [11]: model_id
-# Out[49]: 'PQBFF5oBDk6_T5cL_Izk'
-
-
 def create_ingest_pipeline():
     conn = get_conn()
+    model_id = get_vector_model_id()
+    if not model_id:
+        log.error("Model not found. Cannot create ingest pipeline.")
+        return
+
     pipeline = {
         "description": "An NLP ingest pipeline",
         "processors": [
             {
                 "text_embedding": {
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": model_id,
                     "field_map": {
                         "description": "description_embedding",
                         "title": "title_embedding",
@@ -690,25 +691,3 @@
     }
 
     conn.ingest.put_pipeline("vector_ingest_pipeline", pipeline)
-
-
-def create_search_pipeline():
-    conn = get_conn()
-    pipeline = {
-        "description": "Post processor for hybrid search",
-        "phase_results_processors": [
-            {
-                "normalization-processor": {
-                    "normalization": {"technique": "min_max"},
-                    "combination": {
-                        "technique": "arithmetic_mean",
-                        "parameters": {"weights": [0.7, 0.3]},
-                    },
-                }
-            }
-        ],
-    }
-
-    conn.transport.perform_request(
-        "PUT", "/_search/pipeline/hybrid_search_pipeline", body=pipeline
-    )
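
Note: the hard-coded model id and the stored search pipeline are gone. The ingest pipeline now resolves the model id via get_vector_model_id(), and the hybrid post-processing (min_max normalization, 0.7/0.3 arithmetic-mean weights) is passed inline per request in execute_learn_search rather than saved as a named /_search/pipeline, which is why create_search_pipeline was removed. A sketch of the expected one-time local setup order, based only on the functions in this file (the command or task that actually invokes them is not part of this commit):

from learning_resources_search.indexing_api import (
    create_ingest_pipeline,
    register_model,
    update_local_index_settings_for_hybrid_search,
)

# One-time local setup sketch for hybrid search (invocation path assumed).
update_local_index_settings_for_hybrid_search()  # cluster settings for the ML plugin
register_model()          # register + deploy msmarco-distilbert-base-tas-b
create_ingest_pipeline()  # looks up the model id and maps title/description
                          # to title_embedding/description_embedding at index time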
