Skip to content

Commit b3a2b09

Browse files
committed
feat: add alias-based relating and summary similarity heuristics for advisory grouping
Signed-off-by: shivamshrma09 <shivamsharma27107@gmail.com>
1 parent c8f0a89 commit b3a2b09

File tree

4 files changed

+388
-0
lines changed

4 files changed

+388
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
from django.db import migrations
11+
from django.db import models
12+
import vulnerabilities.models
13+
14+
15+
class Migration(migrations.Migration):
    """Re-declare ``issue_type`` on both advisory ToDo models with the current choices."""

    dependencies = [("vulnerabilities", "0116_advisoryv2_advisory_content_hash")]

    # Both ToDo models carry the same ``issue_type`` definition, so one
    # AlterField is emitted per model name, each with its own field instance.
    operations = [
        migrations.AlterField(
            model_name=todo_model,
            name="issue_type",
            field=models.CharField(
                choices=vulnerabilities.models.ISSUE_TYPE_CHOICES,
                db_index=True,
                help_text="Select the issue that needs to be addressed from the available options.",
                max_length=50,
            ),
        )
        for todo_model in ("advisorytodo", "advisorytodov2")
    ]

vulnerabilities/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2438,6 +2438,11 @@ def create_new_job(self, execute_now=False):
24382438
"Advisories have conflicting affected and fixed-by packages",
24392439
),
24402440
("CONFLICTING_SEVERITY_SCORES", "Advisories have conflicting severity scores"),
2441+
(
2442+
"POTENTIALLY_RELATED_BY_ALIASES",
2443+
"Advisories are potentially related by shared aliases",
2444+
),
2445+
("SIMILAR_SUMMARIES", "Advisories have similar summaries"),
24412446
]
24422447

24432448

vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
#
99

1010

11+
import difflib
1112
import json
13+
from itertools import combinations
1214

1315
from aboutcode.pipeline import LoopProgress
1416
from django.utils import timezone
@@ -20,6 +22,8 @@
2022
from vulnerabilities.pipelines import VulnerableCodePipeline
2123
from vulnerabilities.pipes.advisory import advisories_checksum
2224

25+
SUMMARY_SIMILARITY_THRESHOLD = 0.8
26+
2327

2428
class ComputeToDo(VulnerableCodePipeline):
2529
"""Compute ToDos for Advisory."""
@@ -31,6 +35,8 @@ def steps(cls):
3135
return (
3236
cls.compute_individual_advisory_todo,
3337
cls.detect_conflicting_advisories,
38+
cls.relate_advisories_by_aliases,
39+
cls.detect_similar_summaries,
3440
)
3541

3642
def compute_individual_advisory_todo(self):
@@ -144,6 +150,115 @@ def detect_conflicting_advisories(self):
144150
f"Successfully created {new_todos_count} ToDos for conflicting affected and fixed packages"
145151
)
146152

153+
def relate_advisories_by_aliases(self):
    """
    Create ToDos for advisories from different datasources that share the same alias.

    Walk every AdvisoryAlias and, for each alias whose advisories come from at
    least two distinct datasources, queue a POTENTIALLY_RELATED_BY_ALIASES ToDo
    through check_potentially_related_by_aliases(). Queued ToDos are flushed
    with bulk_create_with_m2m() whenever more than ``batch_size`` are pending,
    and once more after the loop for the remainder.
    """
    aliases = AdvisoryAlias.objects.prefetch_related("advisories")
    aliases_count = aliases.count()
    advisory_relation_to_create = {}
    todo_to_create = []
    new_todos_count = 0
    batch_size = 5000

    self.log(f"Checking alias-based relations across {aliases_count} aliases")

    progress = LoopProgress(
        total_iterations=aliases_count,
        logger=self.log,
        progress_step=1,
    )
    for alias in progress.iter(aliases.iterator(chunk_size=2000)):
        # Read the related advisories exactly once through ``.all()`` so the
        # prefetched rows are reused. Chaining ``.values()`` on the related
        # manager ignores the prefetch cache and costs one extra query per alias.
        advisories = list(alias.advisories.all())

        datasources = {advisory.datasource_id for advisory in advisories}
        if len(datasources) < 2:
            # A single datasource cannot produce a cross-datasource relation.
            continue

        check_potentially_related_by_aliases(
            advisories=advisories,
            alias=alias,
            todo_to_create=todo_to_create,
            advisory_relation_to_create=advisory_relation_to_create,
        )

        if len(todo_to_create) > batch_size:
            new_todos_count += bulk_create_with_m2m(
                todos=todo_to_create,
                advisories=advisory_relation_to_create,
                logger=self.log,
            )
            advisory_relation_to_create.clear()
            todo_to_create.clear()

    # Flush whatever is left from the last, partially filled batch.
    new_todos_count += bulk_create_with_m2m(
        todos=todo_to_create,
        advisories=advisory_relation_to_create,
        logger=self.log,
    )

    self.log(
        f"Successfully created {new_todos_count} ToDos for potentially related advisories by aliases"
    )
206+
207+
def detect_similar_summaries(self):
    """
    Create ToDos for advisories from different datasources that share the same alias
    and have summaries with similarity above SUMMARY_SIMILARITY_THRESHOLD.

    For each AdvisoryAlias, load its advisories that have a non-empty summary,
    skip the alias unless at least two distinct datasources are present, and
    let check_similar_summaries() queue SIMILAR_SUMMARIES ToDos. Queued ToDos
    are flushed with bulk_create_with_m2m() whenever more than ``batch_size``
    are pending, and once more after the loop for the remainder.
    """
    # No prefetch_related("advisories") here: the loop below always chains
    # ``exclude().only()`` on the related manager, which runs its own query
    # and never reads the prefetch cache, so a prefetch would only load
    # every advisory row for nothing.
    aliases = AdvisoryAlias.objects.all()
    aliases_count = aliases.count()
    advisory_relation_to_create = {}
    todo_to_create = []
    new_todos_count = 0
    batch_size = 5000

    self.log(f"Checking summary similarity across {aliases_count} aliases")

    progress = LoopProgress(
        total_iterations=aliases_count,
        logger=self.log,
        progress_step=1,
    )
    for alias in progress.iter(aliases.iterator(chunk_size=2000)):
        # Only the columns needed for comparison and checksum are fetched.
        advisory_objs = list(
            alias.advisories.exclude(summary="").only(
                "id", "datasource_id", "summary", "unique_content_id"
            )
        )

        datasources = {advisory.datasource_id for advisory in advisory_objs}
        if len(datasources) < 2:
            # Summaries are only compared across different datasources.
            continue

        check_similar_summaries(
            advisories=advisory_objs,
            todo_to_create=todo_to_create,
            advisory_relation_to_create=advisory_relation_to_create,
        )

        if len(todo_to_create) > batch_size:
            new_todos_count += bulk_create_with_m2m(
                todos=todo_to_create,
                advisories=advisory_relation_to_create,
                logger=self.log,
            )
            advisory_relation_to_create.clear()
            todo_to_create.clear()

    # Flush whatever is left from the last, partially filled batch.
    new_todos_count += bulk_create_with_m2m(
        todos=todo_to_create,
        advisories=advisory_relation_to_create,
        logger=self.log,
    )

    self.log(
        f"Successfully created {new_todos_count} ToDos for advisories with similar summaries"
    )
261+
147262

148263
def check_missing_summary(
149264
advisory: AdvisoryV2,
@@ -351,3 +466,60 @@ def bulk_create_with_m2m(todos, advisories, logger):
351466
logger(f"Error creating Advisory ToDo relations: {e}")
352467

353468
return new_todos.count()
469+
470+
471+
def check_potentially_related_by_aliases(
    advisories,
    alias,
    todo_to_create,
    advisory_relation_to_create,
):
    """
    Create a POTENTIALLY_RELATED_BY_ALIASES ToDo for advisories from different
    datasources that share the same alias.

    ``advisories`` is the group of advisories sharing ``alias``. The new ToDo is
    appended to ``todo_to_create`` and the group is stored in
    ``advisory_relation_to_create`` under the group's checksum. Both containers
    are mutated in place; nothing is returned.

    Nothing is queued when fewer than two advisories are given, or when the same
    group of advisories is already pending in ``advisory_relation_to_create``.
    """
    if len(advisories) < 2:
        # A relation needs at least two advisories.
        return

    todo_id = advisories_checksum(advisories)
    if todo_id in advisory_relation_to_create:
        # The same advisories often share more than one alias. The relation
        # mapping is keyed by checksum only, so a second ToDo for this group
        # would just be a duplicate row in the pending batch.
        # NOTE(review): assumes the mapping only holds entries of this issue
        # type, as in the visible caller -- confirm for any other caller.
        return

    todo = AdvisoryToDoV2(
        related_advisories_id=todo_id,
        issue_type="POTENTIALLY_RELATED_BY_ALIASES",
        issue_detail=json.dumps({"shared_alias": str(alias)}),
    )
    todo_to_create.append(todo)
    advisory_relation_to_create[todo_id] = advisories
489+
490+
491+
def check_similar_summaries(
    advisories,
    todo_to_create,
    advisory_relation_to_create,
):
    """
    Create SIMILAR_SUMMARIES ToDos for pairs of advisories from different datasources
    whose summaries have a similarity ratio above SUMMARY_SIMILARITY_THRESHOLD.

    Every unordered pair in ``advisories`` is considered. A pair is skipped when
    both advisories come from the same datasource, when either summary is empty,
    when the pair is already pending in ``advisory_relation_to_create``, or when
    the difflib similarity ratio is below the threshold. For each remaining pair
    a ToDo is appended to ``todo_to_create`` and the pair is stored in
    ``advisory_relation_to_create`` under the pair's checksum. Both containers
    are mutated in place; nothing is returned.
    """
    for advisory_a, advisory_b in combinations(advisories, 2):
        if advisory_a.datasource_id == advisory_b.datasource_id:
            continue

        # Two empty strings score a ratio of 1.0 and a None summary cannot be
        # compared at all, so pairs with a missing summary are never "similar".
        if not advisory_a.summary or not advisory_b.summary:
            continue

        pair = [advisory_a, advisory_b]
        todo_id = advisories_checksum(pair)
        if todo_id in advisory_relation_to_create:
            # The same pair can be reached through several shared aliases;
            # it is already queued, so skip the costly comparison.
            # NOTE(review): assumes the mapping only holds entries of this
            # issue type, as in the visible caller -- confirm for others.
            continue

        # NOTE(review): SequenceMatcher's default autojunk heuristic applies
        # when the second sequence has 200+ items; confirm that is acceptable
        # for long summaries compared character by character.
        matcher = difflib.SequenceMatcher(None, advisory_a.summary, advisory_b.summary)

        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); if either is below the threshold the full ratio is too.
        if (
            matcher.real_quick_ratio() < SUMMARY_SIMILARITY_THRESHOLD
            or matcher.quick_ratio() < SUMMARY_SIMILARITY_THRESHOLD
        ):
            continue

        ratio = matcher.ratio()
        if ratio < SUMMARY_SIMILARITY_THRESHOLD:
            continue

        todo = AdvisoryToDoV2(
            related_advisories_id=todo_id,
            issue_type="SIMILAR_SUMMARIES",
            issue_detail=json.dumps(
                {
                    "similarity_score": round(ratio, 4),
                    "datasource_a": advisory_a.datasource_id,
                    "datasource_b": advisory_b.datasource_id,
                }
            ),
        )
        todo_to_create.append(todo)
        advisory_relation_to_create[todo_id] = pair

0 commit comments

Comments
 (0)