Commit afd8ad8

1 parent a08ddc2 commit afd8ad8

10 files changed: +132, -222 lines

frontends/api/src/generated/v1/api.ts

Lines changed: 33 additions & 102 deletions
Some generated files are not rendered by default.

frontends/main/src/app-pages/SearchPage/SearchPage.tsx

Lines changed: 0 additions & 3 deletions
@@ -115,9 +115,6 @@ const SearchPage: React.FC = () => {
     onFacetsChange,
   })

-  console.log(params)
-  console.log(searchParams)
-
   const page = +(searchParams.get("page") ?? "1")

   useEffect(() => {

frontends/main/src/page-components/SearchDisplay/SearchDisplay.tsx

Lines changed: 0 additions & 1 deletion
@@ -639,7 +639,6 @@ const SearchDisplay: React.FC<SearchDisplayProps> = ({
   min_score: number
   max_incompleteness_penalty: number
   content_file_score_weight: number
-  use_hybrid_search: string
 }

 const AdminOptions = (

frontends/main/src/page-components/SearchDisplay/getSearchParams.ts

Lines changed: 0 additions & 2 deletions
@@ -8,7 +8,6 @@ import type {
   Facets,
   BooleanFacets,
 } from "@mitodl/course-search-utils"
-import { use } from "react"

 export const PAGE_SIZE = 20

@@ -40,7 +39,6 @@ const getSearchParams = ({
   max_incompleteness_penalty: searchParams.get("max_incompleteness_penalty"),
   content_file_score_weight: searchParams.get("content_file_score_weight"),
   resource_category: resourceCategory ? [resourceCategory] : null,
-  use_hybrid_search: searchParams.get("use_hybrid_search"),
   aggregations: [...(facetNames || []), "resource_category"],
   ...requestParams,
   offset: (Number(page) - 1) * pageSize,

learning_resources_search/api.py

Lines changed: 40 additions & 19 deletions
@@ -13,13 +13,15 @@
 from learning_resources.models import LearningResource
 from learning_resources_search.connection import (
     get_default_alias_name,
+    get_vector_model_id,
 )
 from learning_resources_search.constants import (
     COMBINED_INDEX,
     CONTENT_FILE_TYPE,
     COURSE_QUERY_FIELDS,
     COURSE_TYPE,
     DEPARTMENT_QUERY_FIELDS,
+    HYBRID_SEARCH_MODE,
     LEARNING_RESOURCE,
     LEARNING_RESOURCE_QUERY_FIELDS,
     LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS,
@@ -67,7 +69,7 @@ def gen_content_file_id(content_file_id):
     return f"cf_{content_file_id}"


-def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search=False):
+def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search):
     """
     Return list of relevent index type for the query

@@ -147,7 +149,11 @@ def generate_sort_clause(search_params):
     return sort


-def wrap_text_clause(text_query, min_score=None, use_hybrid_search=False):
+def wrap_text_clause(
+    text_query,
+    use_hybrid_search,
+    min_score=None,
+):
     """
     Wrap the text subqueries in a bool query
     Shared by generate_content_file_text_clause and
@@ -213,11 +219,11 @@ def generate_content_file_text_clause(text):
     else:
         text_query = {}

-    return wrap_text_clause(text_query)
+    return wrap_text_clause(text_query, use_hybrid_search=False)


 def generate_learning_resources_text_clause(
-    text, search_mode, slop, content_file_score_weight, min_score, use_hybrid_search
+    text, search_mode, slop, content_file_score_weight, min_score
 ):
     """
     Return text clause for the query
@@ -228,16 +234,23 @@ def generate_learning_resources_text_clause(
         dict: dictionary with the opensearch text clause
     """

+    use_hybrid_search = search_mode == HYBRID_SEARCH_MODE
+
     query_type = (
         "query_string" if text.startswith('"') and text.endswith('"') else "multi_match"
     )

     extra_params = {}

-    if query_type == "multi_match" and search_mode:
-        extra_params["type"] = search_mode
+    if use_hybrid_search:
+        text_search_mode = settings.DEFAULT_SEARCH_MODE
+    else:
+        text_search_mode = search_mode
+
+    if query_type == "multi_match":
+        extra_params["type"] = text_search_mode

-    if search_mode == "phrase" and slop:
+    if text_search_mode == "phrase" and slop:
         extra_params["slop"] = slop

     if content_file_score_weight is not None:
@@ -341,7 +354,7 @@ def generate_learning_resources_text_clause(
     else:
         text_query = {}

-    return wrap_text_clause(text_query, min_score, use_hybrid_search)
+    return wrap_text_clause(text_query, use_hybrid_search, min_score)


 def generate_filter_clause(
@@ -591,7 +604,6 @@ def add_text_query_to_search(
         search_params.get("slop"),
         search_params.get("content_file_score_weight"),
         search_params.get("min_score"),
-        use_hybrid_search,
     )

     yearly_decay_percent = search_params.get("yearly_decay_percent")
@@ -641,11 +653,17 @@ def add_text_query_to_search(
         text_query = {"bool": {"must": [text_query], "filter": query_type_query}}

     if use_hybrid_search:
+        vector_model_id = get_vector_model_id()
+        if not vector_model_id:
+            log.error("Vector model not found. Cannot perform hybrid search.")
+            error_message = "Vector model not found."
+            raise ValueError(error_message)
+
         vector_query_description = {
             "neural": {
                 "description_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -655,7 +673,7 @@ def add_text_query_to_search(
             "neural": {
                 "title_embedding": {
                     "query_text": text,
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": vector_model_id,
                     "min_score": 0.015,
                 },
             }
@@ -679,7 +697,7 @@ def add_text_query_to_search(
     return search


-def construct_search(search_params):
+def construct_search(search_params):  # noqa: C901
     """
     Construct a learning resources search based on the query

@@ -698,7 +716,7 @@ def construct_search(search_params):
     ):
         search_params["resource_type"] = list(LEARNING_RESOURCE_TYPES)

-    use_hybrid_search = search_params.get("use_hybrid_search", False)
+    use_hybrid_search = search_params.get("search_mode") == HYBRID_SEARCH_MODE

     indexes = relevant_indexes(
         search_params.get("resource_type"),
@@ -710,7 +728,7 @@ def construct_search(search_params):
     search = Search(index=",".join(indexes))

     search = search.source(fields={"excludes": SOURCE_EXCLUDED_FIELDS})
-    if not search_params.get("use_hybrid_search"):
+    if not use_hybrid_search:
         search = search.params(search_type="dfs_query_then_fetch")
     if search_params.get("offset"):
         search = search.extra(from_=search_params.get("offset"))
@@ -767,12 +785,12 @@ def execute_learn_search(search_params):
     Returns:
         dict: The opensearch response dict
     """
-    print(search_params)
     if search_params.get("endpoint") != CONTENT_FILE_TYPE:
         if search_params.get("yearly_decay_percent") is None:
             search_params["yearly_decay_percent"] = (
                 settings.DEFAULT_SEARCH_STALENESS_PENALTY
             )
+
         if search_params.get("search_mode") is None:
             search_params["search_mode"] = settings.DEFAULT_SEARCH_MODE
         if search_params.get("slop") is None:
@@ -785,7 +803,7 @@ def execute_learn_search(search_params):
     )
     search = construct_search(search_params)

-    if search_params.get("use_hybrid_search"):
+    if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
         search = search.extra(
             search_pipeline={
                 "description": "Post processor for hybrid search",
@@ -803,7 +821,6 @@ def execute_learn_search(search_params):
             }
         )

-    print(search.to_dict())
     return search.execute().to_dict()


@@ -964,7 +981,9 @@ def get_similar_topics(
         list of str:
             list of topic values
     """
-    indexes = relevant_indexes([COURSE_TYPE], [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        [COURSE_TYPE], [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     search = search.filter("term", resource_type=COURSE_TYPE)
     search = search.query(
@@ -1111,7 +1130,9 @@ def get_similar_resources_opensearch(
         list of str:
             list of learning resources
     """
-    indexes = relevant_indexes(LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE)
+    indexes = relevant_indexes(
+        LEARNING_RESOURCE_TYPES, [], endpoint=LEARNING_RESOURCE, use_hybrid_search=False
+    )
     search = Search(index=",".join(indexes))
     if num_resources:
         # adding +1 to num_resources since we filter out existing resource.id
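
Note: hybrid search is now selected by sending search_mode="hybrid" rather than the removed use_hybrid_search flag, and the embedding model id is resolved at query time via get_vector_model_id(). A minimal sketch of driving the new path from Python; the "q" and "limit" keys are assumptions, while "endpoint", "search_mode", and "offset" appear in this diff:

```python
# Sketch only: exercises the hybrid branch of execute_learn_search().
from learning_resources_search.api import execute_learn_search
from learning_resources_search.constants import HYBRID_SEARCH_MODE, LEARNING_RESOURCE

search_params = {
    "q": "machine learning",            # assumed key for the query text
    "endpoint": LEARNING_RESOURCE,
    "search_mode": HYBRID_SEARCH_MODE,  # replaces use_hybrid_search=True
    "offset": 0,
    "limit": 10,                        # assumed key
}

# construct_search() derives use_hybrid_search from search_mode, and
# add_text_query_to_search() raises ValueError if no vector model is deployed.
response = execute_learn_search(search_params)
print(response["hits"]["total"])
```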

learning_resources_search/connection.py

Lines changed: 14 additions & 0 deletions
@@ -135,3 +135,17 @@ def refresh_index(index):
     """
     conn = get_conn()
     conn.indices.refresh(index)
+
+
+def get_vector_model_id():
+    conn = get_conn()
+    model_name = "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b"
+    body = {"query": {"term": {"name.keyword": model_name}}}
+    models = conn.transport.perform_request(
+        "GET", "/_plugins/_ml/models/_search", body=body
+    )
+
+    if len(models.get("hits", {}).get("hits", [])) > 0:
+        return models["hits"]["hits"][0]["_source"]["model_id"]
+
+    return None
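
get_vector_model_id() resolves the deployed embedding model by name through the OpenSearch ML plugin instead of relying on a hard-coded id. A quick sanity check, e.g. from a Django shell (not part of the commit):

```python
# Sketch only: confirm a deployed vector model can be resolved at runtime.
from learning_resources_search.connection import get_vector_model_id

model_id = get_vector_model_id()
if model_id is None:
    # register_model() in indexing_api.py must have been run so that the
    # msmarco-distilbert model exists in the cluster's ML plugin.
    print("No vector model registered; hybrid search would raise ValueError")
else:
    print(f"Hybrid search will use model {model_id}")
```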

learning_resources_search/constants.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
 COMBINED_INDEX = "combined_hybrid"

 LEARNING_RESOURCE = "learning_resource"
+HYBRID_SEARCH_MODE = "hybrid"


 class IndexestoUpdate(Enum):

learning_resources_search/indexing_api.py

Lines changed: 11 additions & 32 deletions
@@ -8,6 +8,7 @@

 from django.conf import settings
 from django.contrib.auth import get_user_model
+from opensearch_py_ml.ml_commons import MLCommonClient
 from opensearchpy.exceptions import ConflictError, NotFoundError
 from opensearchpy.helpers import BulkIndexError, bulk

@@ -17,6 +18,7 @@
     get_conn,
     get_default_alias_name,
     get_reindexing_alias_name,
+    get_vector_model_id,
     make_backing_index_name,
     refresh_index,
 )
@@ -43,7 +45,6 @@

 log = logging.getLogger(__name__)
 User = get_user_model()
-from opensearch_py_ml.ml_commons import MLCommonClient


 def clear_featured_rank(rank, clear_all_greater_than):
@@ -636,7 +637,7 @@ def get_existing_reindexing_indexes(obj_types):
     return reindexing_indexes


-def update_index_settings():
+def update_local_index_settings_for_hybrid_search():
     settings_body = {
         "persistent": {
             "plugins": {
@@ -654,32 +655,32 @@ def update_index_settings():

 def get_ml_client():
     conn = get_conn()
-    ml_client = MLCommonClient(conn)
-    return ml_client
+    return MLCommonClient(conn)


 def register_model():
     ml_client = get_ml_client()
-    model_id = ml_client.register_pretrained_model(
+    ml_client.register_pretrained_model(
         model_name="huggingface/sentence-transformers/msmarco-distilbert-base-tas-b",
         model_version="1.0.3",
         model_format="TORCH_SCRIPT",
         deploy_model=True,
     )


-# In [11]: model_id
-# Out[49]: 'PQBFF5oBDk6_T5cL_Izk'
-
-
 def create_ingest_pipeline():
     conn = get_conn()
+    model_id = get_vector_model_id()
+    if not model_id:
+        log.error("Model not found. Cannot create ingest pipeline.")
+        return
+
     pipeline = {
         "description": "An NLP ingest pipeline",
         "processors": [
             {
                 "text_embedding": {
-                    "model_id": "PQBFF5oBDk6_T5cL_Izk",
+                    "model_id": model_id,
                     "field_map": {
                         "description": "description_embedding",
                         "title": "title_embedding",
@@ -690,25 +691,3 @@ def create_ingest_pipeline():
     }

     conn.ingest.put_pipeline("vector_ingest_pipeline", pipeline)
-
-
-def create_search_pipeline():
-    conn = get_conn()
-    pipeline = {
-        "description": "Post processor for hybrid search",
-        "phase_results_processors": [
-            {
-                "normalization-processor": {
-                    "normalization": {"technique": "min_max"},
-                    "combination": {
-                        "technique": "arithmetic_mean",
-                        "parameters": {"weights": [0.7, 0.3]},
-                    },
-                }
-            }
-        ],
-    }
-
-    conn.transport.perform_request(
-        "PUT", "/_search/pipeline/hybrid_search_pipeline", body=pipeline
-    )
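
create_search_pipeline() is removed because the hybrid post-processor is now passed inline via search.extra(search_pipeline=...) in api.py. The remaining helpers imply a one-time setup order for a local cluster; a rough sketch, assuming they are run from a Django shell (the ordering is inferred, not stated in the commit):

```python
# Sketch only: local bootstrap for hybrid search, order inferred from the
# helpers in learning_resources_search/indexing_api.py.
from learning_resources_search import indexing_api

# Apply the persistent ML-plugin cluster settings this helper defines.
indexing_api.update_local_index_settings_for_hybrid_search()

# Register and deploy the msmarco-distilbert sentence-transformer model.
indexing_api.register_model()

# Create the ingest pipeline; it looks up the model id via get_vector_model_id()
# and logs an error and returns early if no model is found.
indexing_api.create_ingest_pipeline()
```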

learning_resources_search/serializers.py

Lines changed: 2 additions & 7 deletions
@@ -424,9 +424,10 @@ class LearningResourcesSearchRequestSerializer(SearchRequestSerializer):
         ),
     )
     search_mode_choices = [
+        ("phrase", "phrase"),
         ("best_fields", "best_fields"),
         ("most_fields", "most_fields"),
-        ("phrase", "phrase"),
+        ("hybrid", "hybrid"),
     ]
     search_mode = serializers.ChoiceField(
         required=False,
@@ -473,12 +474,6 @@ class LearningResourcesSearchRequestSerializer(SearchRequestSerializer):
             " 0 means content files are ignored"
         ),
     )
-    use_hybrid_search = serializers.BooleanField(
-        required=False,
-        allow_null=True,
-        default=False,
-        help_text="If true, use hybrid search combining vector and keyword search",
-    )


 class ContentFileSearchRequestSerializer(SearchRequestSerializer):
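
Clients opt into hybrid search by passing search_mode=hybrid instead of the removed use_hybrid_search flag. A hypothetical request; the URL path and "q" parameter name are assumptions, only the search_mode choice comes from this commit:

```python
# Hypothetical usage; the endpoint path and "q" are assumed, while
# search_mode="hybrid" reflects LearningResourcesSearchRequestSerializer.
import requests

resp = requests.get(
    "https://example.com/api/v1/learning_resources_search/",
    params={"q": "linear algebra", "search_mode": "hybrid"},
    timeout=10,
)
resp.raise_for_status()
print(list(resp.json().keys()))
```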
