88#
99
1010
11+ import difflib
1112import json
13+ from itertools import combinations
1214
1315from aboutcode .pipeline import LoopProgress
1416from django .utils import timezone
2022from vulnerabilities .pipelines import VulnerableCodePipeline
2123from vulnerabilities .pipes .advisory import advisories_checksum
2224
25+ SUMMARY_SIMILARITY_THRESHOLD = 0.8
26+
2327
2428class ComputeToDo (VulnerableCodePipeline ):
2529 """Compute ToDos for Advisory."""
@@ -31,6 +35,8 @@ def steps(cls):
3135 return (
3236 cls .compute_individual_advisory_todo ,
3337 cls .detect_conflicting_advisories ,
38+ cls .relate_advisories_by_aliases ,
39+ cls .detect_similar_summaries ,
3440 )
3541
3642 def compute_individual_advisory_todo (self ):
@@ -144,6 +150,115 @@ def detect_conflicting_advisories(self):
144150 f"Successfully created { new_todos_count } ToDos for conflicting affected and fixed packages"
145151 )
146152
153+ def relate_advisories_by_aliases (self ):
154+ """
155+ Create ToDos for advisories from different datasources that share the same alias.
156+ """
157+ aliases = AdvisoryAlias .objects .prefetch_related ("advisories" )
158+ aliases_count = aliases .count ()
159+ advisory_relation_to_create = {}
160+ todo_to_create = []
161+ new_todos_count = 0
162+ batch_size = 5000
163+
164+ self .log (f"Checking alias-based relations across { aliases_count } aliases" )
165+
166+ progress = LoopProgress (
167+ total_iterations = aliases_count ,
168+ logger = self .log ,
169+ progress_step = 1 ,
170+ )
171+ for alias in progress .iter (aliases .iterator (chunk_size = 2000 )):
172+ advisories = list (
173+ alias .advisories .values ("id" , "datasource_id" , "unique_content_id" )
174+ )
175+
176+ datasources = {a ["datasource_id" ] for a in advisories }
177+ if len (datasources ) < 2 :
178+ continue
179+
180+ advisory_objs = list (alias .advisories .all ())
181+ check_potentially_related_by_aliases (
182+ advisories = advisory_objs ,
183+ alias = alias ,
184+ todo_to_create = todo_to_create ,
185+ advisory_relation_to_create = advisory_relation_to_create ,
186+ )
187+
188+ if len (todo_to_create ) > batch_size :
189+ new_todos_count += bulk_create_with_m2m (
190+ todos = todo_to_create ,
191+ advisories = advisory_relation_to_create ,
192+ logger = self .log ,
193+ )
194+ advisory_relation_to_create .clear ()
195+ todo_to_create .clear ()
196+
197+ new_todos_count += bulk_create_with_m2m (
198+ todos = todo_to_create ,
199+ advisories = advisory_relation_to_create ,
200+ logger = self .log ,
201+ )
202+
203+ self .log (
204+ f"Successfully created { new_todos_count } ToDos for potentially related advisories by aliases"
205+ )
206+
207+ def detect_similar_summaries (self ):
208+ """
209+ Create ToDos for advisories from different datasources that share the same alias
210+ and have summaries with similarity above SUMMARY_SIMILARITY_THRESHOLD.
211+ """
212+ aliases = AdvisoryAlias .objects .prefetch_related ("advisories" )
213+ aliases_count = aliases .count ()
214+ advisory_relation_to_create = {}
215+ todo_to_create = []
216+ new_todos_count = 0
217+ batch_size = 5000
218+
219+ self .log (f"Checking summary similarity across { aliases_count } aliases" )
220+
221+ progress = LoopProgress (
222+ total_iterations = aliases_count ,
223+ logger = self .log ,
224+ progress_step = 1 ,
225+ )
226+ for alias in progress .iter (aliases .iterator (chunk_size = 2000 )):
227+ advisory_objs = list (
228+ alias .advisories .exclude (summary = "" ).only (
229+ "id" , "datasource_id" , "summary" , "unique_content_id"
230+ )
231+ )
232+
233+ datasources = {a .datasource_id for a in advisory_objs }
234+ if len (datasources ) < 2 :
235+ continue
236+
237+ check_similar_summaries (
238+ advisories = advisory_objs ,
239+ todo_to_create = todo_to_create ,
240+ advisory_relation_to_create = advisory_relation_to_create ,
241+ )
242+
243+ if len (todo_to_create ) > batch_size :
244+ new_todos_count += bulk_create_with_m2m (
245+ todos = todo_to_create ,
246+ advisories = advisory_relation_to_create ,
247+ logger = self .log ,
248+ )
249+ advisory_relation_to_create .clear ()
250+ todo_to_create .clear ()
251+
252+ new_todos_count += bulk_create_with_m2m (
253+ todos = todo_to_create ,
254+ advisories = advisory_relation_to_create ,
255+ logger = self .log ,
256+ )
257+
258+ self .log (
259+ f"Successfully created { new_todos_count } ToDos for advisories with similar summaries"
260+ )
261+
147262
148263def check_missing_summary (
149264 advisory : AdvisoryV2 ,
@@ -351,3 +466,60 @@ def bulk_create_with_m2m(todos, advisories, logger):
351466 logger (f"Error creating Advisory ToDo relations: { e } " )
352467
353468 return new_todos .count ()
469+
470+
def check_potentially_related_by_aliases(
    advisories,
    alias,
    todo_to_create,
    advisory_relation_to_create,
):
    """
    Queue a POTENTIALLY_RELATED_BY_ALIASES ToDo for ``advisories`` — advisories
    from different datasources that share the same ``alias``.

    The ToDo is appended to ``todo_to_create`` and its advisory relations are
    recorded in ``advisory_relation_to_create`` (keyed by the checksum of the
    advisory group); the caller is responsible for bulk-creating both.
    """
    todo_id = advisories_checksum(advisories)
    # The same advisory group can be reached through several shared aliases
    # (e.g. a CVE id and its GHSA id attached to the same advisories). Skip
    # groups already queued in this batch so we do not append duplicate ToDos
    # with the same related_advisories_id.
    if todo_id in advisory_relation_to_create:
        return

    todo = AdvisoryToDoV2(
        related_advisories_id=todo_id,
        issue_type="POTENTIALLY_RELATED_BY_ALIASES",
        issue_detail=json.dumps({"shared_alias": str(alias)}),
    )
    todo_to_create.append(todo)
    advisory_relation_to_create[todo_id] = advisories
489+
490+
def check_similar_summaries(
    advisories,
    todo_to_create,
    advisory_relation_to_create,
):
    """
    Queue SIMILAR_SUMMARIES ToDos for pairs of advisories from different
    datasources whose summaries have a similarity ratio at or above
    SUMMARY_SIMILARITY_THRESHOLD.

    ToDos are appended to ``todo_to_create`` and their advisory relations are
    recorded in ``advisory_relation_to_create`` (keyed by the pair checksum);
    the caller is responsible for bulk-creating both.
    """
    for advisory_a, advisory_b in combinations(advisories, 2):
        # Only compare advisories coming from different datasources.
        if advisory_a.datasource_id == advisory_b.datasource_id:
            continue

        matcher = difflib.SequenceMatcher(None, advisory_a.summary, advisory_b.summary)
        # quick_ratio() is an upper bound on ratio() and much cheaper to
        # compute, so it safely prunes clearly-dissimilar pairs before the
        # expensive full comparison.
        if matcher.quick_ratio() < SUMMARY_SIMILARITY_THRESHOLD:
            continue

        ratio = matcher.ratio()
        if ratio < SUMMARY_SIMILARITY_THRESHOLD:
            continue

        pair = [advisory_a, advisory_b]
        todo_id = advisories_checksum(pair)
        # The same pair can share several aliases; skip pairs already queued in
        # this batch to avoid appending duplicate ToDos with the same
        # related_advisories_id.
        if todo_id in advisory_relation_to_create:
            continue

        todo = AdvisoryToDoV2(
            related_advisories_id=todo_id,
            issue_type="SIMILAR_SUMMARIES",
            issue_detail=json.dumps(
                {
                    "similarity_score": round(ratio, 4),
                    "datasource_a": advisory_a.datasource_id,
                    "datasource_b": advisory_b.datasource_id,
                }
            ),
        )
        todo_to_create.append(todo)
        advisory_relation_to_create[todo_id] = pair
0 commit comments