Skip to content

Commit cb079e2

Browse files
committed
feat: cache csv & json loaded data to avoid I/O at each start / stop task execution
feat: add cache to detect cpu model function
1 parent 4462829 commit cb079e2

File tree

4 files changed

+212
-31
lines changed

4 files changed

+212
-31
lines changed

codecarbon/core/util.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import subprocess
44
import sys
55
from contextlib import contextmanager
6+
from functools import lru_cache
67
from os.path import expandvars
78
from pathlib import Path
89
from typing import Optional, Union
@@ -73,25 +74,26 @@ def backup(file_path: Union[str, Path], ext: Optional[str] = ".bak") -> None:
7374
file_path.rename(backup_path)
7475

7576

76-
def detect_cpu_model() -> str:
77+
@lru_cache(maxsize=1)
78+
def detect_cpu_model() -> Optional[str]:
7779
cpu_info = cpuinfo.get_cpu_info()
7880
if cpu_info:
7981
cpu_model_detected = cpu_info.get("brand_raw", "")
8082
return cpu_model_detected
8183
return None
8284

8385

84-
def is_mac_os() -> str:
86+
def is_mac_os() -> bool:
8587
system = sys.platform.lower()
8688
return system.startswith("dar")
8789

8890

89-
def is_windows_os() -> str:
91+
def is_windows_os() -> bool:
9092
system = sys.platform.lower()
9193
return system.startswith("win")
9294

9395

94-
def is_linux_os() -> str:
96+
def is_linux_os() -> bool:
9597
system = sys.platform.lower()
9698
return system.startswith("lin")
9799

codecarbon/input.py

Lines changed: 80 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
"""
2-
App configuration: This will likely change when we have a common location for data files
2+
App configuration and static reference data loading.
3+
4+
Data files are static reference data that never change during runtime.
5+
They are loaded once at module import to avoid repeated file I/O on the hot path
6+
(start_task/stop_task calls for instance).
37
"""
48

59
import atexit
610
import json
711
import sys
812
from contextlib import ExitStack
9-
from typing import Dict
13+
from typing import Any, Dict
1014

1115
import pandas as pd
1216

@@ -18,6 +22,49 @@
1822
from importlib_resources import files as importlib_resources_files
1923

2024

25+
_CACHE: Dict[str, Any] = {}
26+
_MODULE_NAME = "codecarbon"
27+
28+
29+
def _get_resource_path(filepath: str):
    """
    Resolve a resource bundled in the ``_MODULE_NAME`` package to a path.

    The ``as_file`` context is entered on a dedicated ``ExitStack`` whose
    ``close`` is registered with ``atexit``, so the context stays open (and
    the returned path stays usable) until the interpreter shuts down.

    :param filepath: resource location relative to the package root
    :return: the object yielded by ``importlib_resources_as_file``
    """
    # One stack per call: each resource gets its own exit-time cleanup hook.
    cleanup_stack = ExitStack()
    atexit.register(cleanup_stack.close)

    resource = importlib_resources_files(_MODULE_NAME).joinpath(filepath)
    return cleanup_stack.enter_context(importlib_resources_as_file(resource))
36+
37+
38+
def _load_static_data() -> None:
    """
    Load all static reference data into the module-level ``_CACHE``.

    Populates, in this order, the keys ``"global_energy_mix"`` (JSON),
    ``"cloud_emissions"`` (CSV), ``"carbon_intensity_per_source"`` (JSON)
    and ``"cpu_power"`` (CSV). JSON resources are stored as parsed by
    ``json.load``; CSV resources are stored as returned by ``pd.read_csv``.

    The cached objects are ordinary mutable Python/pandas objects shared by
    everything that reads ``_CACHE``, so consumers should treat them as
    read-only.
    """

    def _read_json(path) -> Any:
        # JSON is UTF-8 by specification; be explicit so decoding does not
        # depend on the platform's locale default encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    # (cache key, package-relative resource, reader) in load order.
    resources = (
        (
            "global_energy_mix",
            "data/private_infra/global_energy_mix.json",
            _read_json,
        ),
        ("cloud_emissions", "data/cloud/impact.csv", pd.read_csv),
        (
            "carbon_intensity_per_source",
            "data/private_infra/carbon_intensity_per_source.json",
            _read_json,
        ),
        ("cpu_power", "data/hardware/cpu_power.csv", pd.read_csv),
    )
    for cache_key, resource, reader in resources:
        _CACHE[cache_key] = reader(_get_resource_path(resource))
62+
63+
64+
# Load static data at module import
65+
_load_static_data()
66+
67+
2168
class DataSource:
2269
def __init__(self):
2370
self.config = {
@@ -84,56 +131,63 @@ def cpu_power_path(self):
84131

85132
def get_global_energy_mix_data(self) -> Dict:
86133
"""
87-
Returns Global Energy Mix Data
134+
Returns Global Energy Mix Data.
135+
Data is pre-loaded at module import for performance.
88136
"""
89-
with open(self.global_energy_mix_data_path) as f:
90-
global_energy_mix: Dict = json.load(f)
91-
return global_energy_mix
137+
return _CACHE["global_energy_mix"]
92138

93139
def get_cloud_emissions_data(self) -> pd.DataFrame:
94140
"""
95-
Returns Cloud Regions Impact Data
141+
Returns Cloud Regions Impact Data.
142+
Data is pre-loaded at module import for performance.
96143
"""
97-
return pd.read_csv(self.cloud_emissions_path)
144+
return _CACHE["cloud_emissions"]
98145

99146
def get_country_emissions_data(self, country_iso_code: str) -> Dict:
100147
"""
101-
Returns Emissions Across Regions in a country
148+
Returns Emissions Across Regions in a country.
149+
Data is cached on first access per country.
150+
102151
:param country_iso_code: ISO code similar to one used in file names
103152
:return: emissions in lbs/MWh and region code
104153
"""
105-
try:
106-
with open(self.country_emissions_data_path(country_iso_code)) as f:
107-
country_emissions_data: Dict = json.load(f)
108-
return country_emissions_data
109-
except KeyError:
110-
# KeyError raised from line 39, when there is no data path specified for
111-
# the given country
112-
raise DataSourceException
154+
cache_key = f"country_emissions_{country_iso_code}"
155+
if cache_key not in _CACHE:
156+
try:
157+
with open(self.country_emissions_data_path(country_iso_code)) as f:
158+
_CACHE[cache_key] = json.load(f)
159+
except KeyError:
160+
# KeyError raised when there is no data path specified for the country
161+
raise DataSourceException
162+
return _CACHE[cache_key]
113163

114164
def get_country_energy_mix_data(self, country_iso_code: str) -> Dict:
115165
"""
116-
Returns Energy Mix Across Regions in a country
166+
Returns Energy Mix Across Regions in a country.
167+
Data is cached on first access per country.
168+
117169
:param country_iso_code: ISO code similar to one used in file names
118170
:return: energy mix by region code
119171
"""
120-
with open(self.country_energy_mix_data_path(country_iso_code)) as f:
121-
country_energy_mix_data: Dict = json.load(f)
122-
return country_energy_mix_data
172+
cache_key = f"country_energy_mix_{country_iso_code}"
173+
if cache_key not in _CACHE:
174+
with open(self.country_energy_mix_data_path(country_iso_code)) as f:
175+
_CACHE[cache_key] = json.load(f)
176+
return _CACHE[cache_key]
123177

124178
def get_carbon_intensity_per_source_data(self) -> Dict:
125179
"""
126180
Returns Carbon intensity per source. In gCO2.eq/kWh.
181+
Data is pre-loaded at module import for performance.
127182
"""
128-
with open(self.carbon_intensity_per_source_path) as f:
129-
carbon_intensity_per_source: Dict = json.load(f)
130-
return carbon_intensity_per_source
183+
return _CACHE["carbon_intensity_per_source"]
131184

132185
def get_cpu_power_data(self) -> pd.DataFrame:
133186
"""
134-
Returns CPU power Data
187+
Returns CPU power Data.
188+
Data is pre-loaded at module import for performance.
135189
"""
136-
return pd.read_csv(self.cpu_power_path)
190+
return _CACHE["cpu_power"]
137191

138192

139193
class DataSourceException(Exception):

tests/test_core_util.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,34 @@
11
import shutil
22
import tempfile
33

4-
from codecarbon.core.util import backup, resolve_path
4+
from codecarbon.core.util import backup, detect_cpu_model, resolve_path
5+
6+
7+
def test_detect_cpu_model_caching():
    """Check that repeated detect_cpu_model() calls are served from its cache."""
    # Start from an empty cache so the hit/miss counters are predictable.
    detect_cpu_model.cache_clear()

    # Call 1: nothing cached yet, so this is the single miss.
    first = detect_cpu_model()
    stats = detect_cpu_model.cache_info()
    assert stats.hits == 0
    assert stats.misses == 1

    # Call 2: answered from the cache.
    second = detect_cpu_model()
    stats = detect_cpu_model.cache_info()
    assert stats.hits == 1
    assert stats.misses == 1

    # The cached answer matches the freshly computed one.
    assert first == second

    # Call 3: still a hit; the miss count never grows.
    detect_cpu_model()
    stats = detect_cpu_model.cache_info()
    assert stats.hits == 2
    assert stats.misses == 1
532

633

734
def test_backup():

tests/test_input.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
Tests for codecarbon/input.py module-level caching.
3+
4+
The caching mechanism loads static reference data once at module import
5+
to avoid file I/O on the hot path (start_task/stop_task).
6+
"""
7+
8+
import unittest
9+
10+
11+
class TestDataSourceCaching(unittest.TestCase):
    """Test that DataSource uses module-level cache for static data."""

    def test_cache_populated_at_import(self):
        """Verify that _CACHE holds every pre-loaded dataset, non-empty."""
        from codecarbon.input import _CACHE

        preloaded_keys = (
            "global_energy_mix",
            "cloud_emissions",
            "carbon_intensity_per_source",
            "cpu_power",
        )
        for key in preloaded_keys:
            # subTest reports every missing/empty key, not just the first.
            with self.subTest(key=key):
                self.assertIn(key, _CACHE)
                self.assertGreater(len(_CACHE[key]), 0)

    def test_get_global_energy_mix_returns_cached_data(self):
        """Verify get_global_energy_mix_data() returns cached object."""
        from codecarbon.input import _CACHE, DataSource

        data = DataSource().get_global_energy_mix_data()

        # Should return the exact same object from cache
        self.assertIs(data, _CACHE["global_energy_mix"])

    def test_get_cloud_emissions_returns_cached_data(self):
        """Verify get_cloud_emissions_data() returns cached object."""
        from codecarbon.input import _CACHE, DataSource

        data = DataSource().get_cloud_emissions_data()

        # Should return the exact same object from cache
        self.assertIs(data, _CACHE["cloud_emissions"])

    def test_get_carbon_intensity_returns_cached_data(self):
        """Verify get_carbon_intensity_per_source_data() returns cached object."""
        from codecarbon.input import _CACHE, DataSource

        data = DataSource().get_carbon_intensity_per_source_data()

        # Should return the exact same object from cache
        self.assertIs(data, _CACHE["carbon_intensity_per_source"])

    def test_get_cpu_power_returns_cached_data(self):
        """Verify get_cpu_power_data() returns cached object."""
        from codecarbon.input import _CACHE, DataSource

        data = DataSource().get_cpu_power_data()

        # Should return the exact same object from cache
        self.assertIs(data, _CACHE["cpu_power"])

    def test_country_data_lazy_loaded(self):
        """Verify country-specific data is loaded on demand and then reused."""
        from codecarbon.input import _CACHE, DataSource

        cache_key = "country_emissions_usa"
        # Evict any entry left behind by an earlier test so the outcome does
        # not depend on test execution order.
        _CACHE.pop(cache_key, None)

        ds = DataSource()
        data = ds.get_country_emissions_data("usa")

        # First access must populate the cache with the returned object.
        self.assertIn(cache_key, _CACHE)
        self.assertIs(data, _CACHE[cache_key])

        # A second access must hand back the very same cached object.
        self.assertIs(ds.get_country_emissions_data("usa"), data)

    def test_multiple_datasource_instances_share_cache(self):
        """Verify that multiple DataSource instances share the same cache."""
        from codecarbon.input import DataSource

        ds1 = DataSource()
        ds2 = DataSource()

        # Both instances should return the same cached object
        data1 = ds1.get_global_energy_mix_data()
        data2 = ds2.get_global_energy_mix_data()

        self.assertIs(data1, data2)
95+
96+
97+
if __name__ == "__main__":
98+
unittest.main()

0 commit comments

Comments
 (0)