Skip to content

Commit 80940ca

Browse files
Fix #2122: Standardize User-Agent headers across Importers to prevent blocking
1 parent b5a4445 commit 80940ca

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+381
-67
lines changed

aboutcode/federated/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626
from packageurl import normalize_subpath
2727
from packageurl import normalize_version
2828

29+
from django.conf import settings
30+
2931
__version__ = "0.1.0"
3032

3133
"""
@@ -559,7 +561,7 @@ def from_url(
559561
federation_name=name,
560562
config_filename=cls.CONFIG_FILENAME,
561563
)
562-
headers = {"User-Agent": "AboutCode/FederatedCode"}
564+
headers = {"User-Agent": settings.VC_USER_AGENT}
563565
response = requests.get(url=rcf_url, headers=headers)
564566
if not response.ok:
565567
raise Exception(f"Failed to fetch Federation config: {rcf_url}")

vulnerabilities/importers/apache_httpd.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
from vulnerabilities.utils import create_weaknesses_list
2828
from vulnerabilities.utils import cwe_regex
2929
from vulnerabilities.utils import get_item
30+
from django.conf import settings
3031

3132
logger = logging.getLogger(__name__)
3233

@@ -41,7 +42,10 @@ class ApacheHTTPDImporter(Importer):
4142
def advisory_data(self):
4243
links = fetch_links(self.base_url)
4344
for link in links:
44-
data = requests.get(link).json()
45+
data = requests.get(
46+
link,
47+
headers={'User-Agent': settings.VC_USER_AGENT}
48+
).json()
4549
yield self.to_advisory(data)
4650

4751
def to_advisory(self, data):
@@ -150,7 +154,10 @@ def to_version_ranges(self, versions_data, fixed_versions):
150154

151155
def fetch_links(url):
152156
links = []
153-
data = requests.get(url).content
157+
data = requests.get(
158+
url,
159+
headers={'User-Agent': settings.VC_USER_AGENT}
160+
).content
154161
soup = BeautifulSoup(data, features="lxml")
155162
for tag in soup.find_all("a"):
156163
link = tag.get("href")

vulnerabilities/importers/apache_kafka.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from vulnerabilities.importer import AffectedPackage
2121
from vulnerabilities.importer import Importer
2222
from vulnerabilities.importer import Reference
23+
from django.conf import settings
2324

2425
logger = logging.getLogger(__name__)
2526

@@ -99,7 +100,10 @@ class ApacheKafkaImporter(Importer):
99100

100101
@staticmethod
101102
def fetch_advisory_page(self):
102-
page = requests.get(self.GH_PAGE_URL)
103+
page = requests.get(
104+
self.GH_PAGE_URL,
105+
headers={'User-Agent': settings.VC_USER_AGENT}
106+
)
103107
return page.content
104108

105109
def advisory_data(self):

vulnerabilities/importers/apache_tomcat.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
from vulnerabilities.importer import Reference
2828
from vulnerabilities.importer import VulnerabilitySeverity
2929
from vulnerabilities.severity_systems import APACHE_TOMCAT
30+
from django.conf import settings
3031

3132
LOGGER = logging.getLogger(__name__)
3233

@@ -126,15 +127,21 @@ def fetch_advisory_pages(self):
126127
"""
127128
links = self.fetch_advisory_links("https://tomcat.apache.org/security")
128129
for page_url in links:
129-
yield page_url, requests.get(page_url).content
130+
yield page_url, requests.get(
131+
page_url,
132+
headers={'User-Agent': settings.VC_USER_AGENT}
133+
).content
130134

131135
def fetch_advisory_links(self, url):
132136
"""
133137
Yield the URLs of each Tomcat version security-related page.
134138
Each page link is in the form of `https://tomcat.apache.org/security-10.html`,
135139
for instance, for v10.
136140
"""
137-
data = requests.get(url).content
141+
data = requests.get(
142+
url,
143+
headers={'User-Agent': settings.VC_USER_AGENT}
144+
).content
138145
soup = BeautifulSoup(data, features="lxml")
139146
for tag in soup.find_all("a"):
140147
link = tag.get("href")

vulnerabilities/importers/debian.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
from vulnerabilities.utils import create_weaknesses_list
2828
from vulnerabilities.utils import dedupe
2929
from vulnerabilities.utils import get_item
30+
from django.conf import settings
3031

3132
logger = logging.getLogger(__name__)
3233

@@ -83,7 +84,10 @@ class DebianImporter(Importer):
8384
importer_name = "Debian Importer"
8485

8586
def get_response(self):
86-
response = requests.get(self.api_url)
87+
response = requests.get(
88+
self.api_url,
89+
headers={'User-Agent': settings.VC_USER_AGENT}
90+
)
8791
if response.status_code == 200:
8892
return response.json()
8993
raise Exception(

vulnerabilities/importers/debian_oval.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
import requests
1515

1616
from vulnerabilities.importer import OvalImporter
17-
17+
from django.conf import settings
1818

1919
class DebianOvalImporter(OvalImporter):
2020

@@ -68,7 +68,10 @@ def _fetch(self):
6868
for release in releases:
6969
file_url = f"https://www.debian.org/security/oval/oval-definitions-{release}.xml.bz2"
7070
self.data_url = file_url
71-
resp = requests.get(file_url).content
71+
resp = requests.get(
72+
file_url,
73+
headers={'User-Agent': settings.VC_USER_AGENT}
74+
).content
7275
extracted = bz2.decompress(resp)
7376
yield (
7477
{"type": "deb", "namespace": "debian", "qualifiers": {"distro": release}},

vulnerabilities/importers/gsd.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from vulnerabilities.importer import Reference
2323
from vulnerabilities.utils import build_description
2424
from vulnerabilities.utils import dedupe
25+
from django.conf import settings
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -32,7 +33,10 @@ class GSDImporter: # TODO inherit from Importer
3233
url = "https://codeload.github.com/cloudsecurityalliance/gsd-database/zip/refs/heads/main"
3334

3435
def advisory_data(self) -> Iterable[AdvisoryData]:
35-
response = requests.get(self.url).content
36+
response = requests.get(
37+
self.url,
38+
headers={'User-Agent': settings.VC_USER_AGENT}
39+
).content
3640
with ZipFile(BytesIO(response)) as zip_file:
3741
for file_name in zip_file.namelist():
3842
if file_name == "gsd-database-main/allowlist.json" or not file_name.endswith(

vulnerabilities/importers/mattermost.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from vulnerabilities.importer import Reference
2626
from vulnerabilities.importer import VulnerabilitySeverity
2727
from vulnerabilities.package_managers import GitHubTagsAPI
28+
from django.conf import settings
2829

2930
SECURITY_UPDATES_URL = "https://mattermost.com/security-updates"
3031
MM_REPO = {
@@ -36,13 +37,13 @@
3637

3738
class MattermostDataSource(Importer):
3839
def updated_advisories(self):
39-
# FIXME: Change after this https://forum.mattermost.org/t/mattermost-website-returning-403-when-headers-contain-the-word-python/11412
4040
self.set_api()
4141
data = requests.get(
42-
SECURITY_UPDATES_URL, headers={"user-agent": "aboutcode/vulnerablecode"}
42+
SECURITY_UPDATES_URL,
43+
headers={"User-Agent": settings.VC_USER_AGENT},
4344
).content
4445
return self.batch_advisories(self.to_advisories(data))
45-
46+
4647
def set_api(self):
4748
self.version_api = GitHubTagsAPI()
4849
asyncio.run(

vulnerabilities/importers/openssl.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from vulnerabilities.importer import Reference
2626
from vulnerabilities.importer import VulnerabilitySeverity
2727
from vulnerabilities.severity_systems import SCORING_SYSTEMS
28+
from django.conf import settings
2829

2930
logger = logging.getLogger(__name__)
3031

@@ -36,7 +37,10 @@ class OpensslImporter(Importer):
3637
importer_name = "OpenSSL Importer"
3738

3839
def fetch(self):
39-
response = requests.get(url=self.url)
40+
response = requests.get(
41+
url=self.url,
42+
headers={'User-Agent': settings.VC_USER_AGENT}
43+
)
4044
if not response.status_code == 200:
4145
logger.error(f"Error while fetching {self.url}: {response.status_code}")
4246
return

vulnerabilities/importers/postgresql.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
from vulnerabilities.importer import Importer
2222
from vulnerabilities.importer import Reference
2323
from vulnerabilities.importer import VulnerabilitySeverity
24-
24+
from django.conf import settings
2525

2626
class PostgreSQLImporter(Importer):
2727

@@ -37,7 +37,10 @@ def advisory_data(self):
3737
while True:
3838
unvisited_urls = known_urls - visited_urls
3939
for url in unvisited_urls:
40-
data = requests.get(url).content
40+
data = requests.get(
41+
url,
42+
headers={'User-Agent': settings.VC_USER_AGENT}
43+
).content
4144
data_by_url[url] = data
4245
visited_urls.add(url)
4346
known_urls.update(find_advisory_urls(data))

0 commit comments

Comments
 (0)