Skip to content

Commit 9e3ab90

Browse files
authored
Drop hardcoded Nebius InfiniBand fabrics (#3234)
Use the fabrics from gpuhunt offers instead. The list of fabrics was moved from dstack to gpuhunt so that new fabrics can be added without a dstack release.
1 parent 8e6aec8 commit 9e3ab90

File tree

6 files changed

+101
-89
lines changed

6 files changed

+101
-89
lines changed

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@
2828
get_offers_disk_modifier,
2929
)
3030
from dstack._internal.core.backends.nebius import resources
31-
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
32-
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
31+
from dstack._internal.core.backends.nebius.models import (
32+
NebiusConfig,
33+
NebiusOfferBackendData,
34+
NebiusServiceAccountCreds,
35+
)
3336
from dstack._internal.core.errors import (
3437
BackendError,
3538
NotYetTerminated,
@@ -281,23 +284,30 @@ def create_placement_group(
281284
master_instance_offer: InstanceOffer,
282285
) -> PlacementGroupProvisioningData:
283286
assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
284-
backend_data = NebiusPlacementGroupBackendData(cluster=None)
287+
master_instance_offer_backend_data: NebiusOfferBackendData = (
288+
NebiusOfferBackendData.__response__.parse_obj(master_instance_offer.backend_data)
289+
)
290+
fabrics = list(master_instance_offer_backend_data.fabrics)
291+
if self.config.fabrics is not None:
292+
fabrics = [f for f in fabrics if f in self.config.fabrics]
293+
placement_group_backend_data = NebiusPlacementGroupBackendData(cluster=None)
285294
# Only create a Nebius cluster if the instance supports it.
286295
# For other instances, return dummy PlacementGroupProvisioningData.
287-
if fabrics := get_suitable_infiniband_fabrics(
288-
master_instance_offer, allowed_fabrics=self.config.fabrics
289-
):
296+
if fabrics:
290297
fabric = random.choice(fabrics)
291298
op = resources.create_cluster(
292299
self._sdk,
293300
name=placement_group.name,
294301
project_id=self._region_to_project_id[placement_group.configuration.region],
295302
fabric=fabric,
296303
)
297-
backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
304+
placement_group_backend_data.cluster = NebiusClusterBackendData(
305+
id=op.resource_id,
306+
fabric=fabric,
307+
)
298308
return PlacementGroupProvisioningData(
299309
backend=BackendType.NEBIUS,
300-
backend_data=backend_data.json(),
310+
backend_data=placement_group_backend_data.json(),
301311
)
302312

303313
def delete_placement_group(self, placement_group: PlacementGroup) -> None:
@@ -317,16 +327,15 @@ def is_suitable_placement_group(
317327
if placement_group.configuration.region != instance_offer.region:
318328
return False
319329
assert placement_group.provisioning_data is not None
320-
backend_data = NebiusPlacementGroupBackendData.load(
330+
placement_group_backend_data = NebiusPlacementGroupBackendData.load(
321331
placement_group.provisioning_data.backend_data
322332
)
333+
instance_offer_backend_data: NebiusOfferBackendData = (
334+
NebiusOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
335+
)
323336
return (
324-
backend_data.cluster is None
325-
or backend_data.cluster.fabric
326-
in get_suitable_infiniband_fabrics(
327-
instance_offer,
328-
allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
329-
)
337+
placement_group_backend_data.cluster is None
338+
or placement_group_backend_data.cluster.fabric in instance_offer_backend_data.fabrics
330339
)
331340

332341

src/dstack/_internal/core/backends/nebius/configurator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
)
1111
from dstack._internal.core.backends.nebius import resources
1212
from dstack._internal.core.backends.nebius.backend import NebiusBackend
13-
from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
1413
from dstack._internal.core.backends.nebius.models import (
1514
NebiusBackendConfig,
1615
NebiusBackendConfigWithCreds,
@@ -19,6 +18,7 @@
1918
NebiusServiceAccountCreds,
2019
NebiusStoredConfig,
2120
)
21+
from dstack._internal.core.backends.nebius.resources import get_all_infiniband_fabrics
2222
from dstack._internal.core.errors import BackendError, ServerClientError
2323
from dstack._internal.core.models.backends.base import BackendType
2424

src/dstack/_internal/core/backends/nebius/fabrics.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

src/dstack/_internal/core/backends/nebius/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,7 @@ class NebiusConfig(NebiusStoredConfig):
179179
"""
180180

181181
creds: AnyNebiusCreds
182+
183+
184+
class NebiusOfferBackendData(CoreModel):
    # Backend-specific payload carried in the `backend_data` of a Nebius offer.
    # Names of the InfiniBand fabrics listed for the offer. Defaults to an
    # empty set, so offers without any fabric information parse successfully.
    fabrics: set[str] = set()

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,14 @@
4949
from nebius.sdk import SDK
5050

5151
from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
52+
from dstack._internal.core.backends.base.offers import get_catalog_offers
5253
from dstack._internal.core.backends.nebius.models import (
5354
DEFAULT_PROJECT_NAME_PREFIX,
55+
NebiusOfferBackendData,
5456
NebiusServiceAccountCreds,
5557
)
5658
from dstack._internal.core.errors import BackendError, NoCapacityError
59+
from dstack._internal.core.models.backends.base import BackendType
5760
from dstack._internal.utils.event_loop import DaemonEventLoop
5861
from dstack._internal.utils.logging import get_logger
5962

@@ -250,6 +253,17 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
250253
raise BackendError(f"Could not find default subnet in project {project_id}")
251254

252255

256+
def get_all_infiniband_fabrics() -> set[str]:
    """Collect every InfiniBand fabric name found in the Nebius catalog offers.

    Each offer's ``backend_data`` is parsed as ``NebiusOfferBackendData`` and
    the per-offer ``fabrics`` sets are merged together.

    Returns:
        The union of the fabrics of all Nebius catalog offers. The set is
        empty when there are no offers or none of them lists a fabric.
    """
    fabrics_per_offer = (
        NebiusOfferBackendData.__response__.parse_obj(offer.backend_data).fabrics
        for offer in get_catalog_offers(backend=BackendType.NEBIUS)
    )
    all_fabrics: set[str] = set()
    # `union` with zero arguments still returns a fresh (empty) set.
    return all_fabrics.union(*fabrics_per_offer)
265+
266+
253267
def create_disk(
254268
sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
255269
) -> SDKOperation[Operation]:

src/tests/_internal/server/routers/test_backends.py

Lines changed: 58 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,13 @@
1313

1414
from dstack._internal.core.backends.oci import region as oci_region
1515
from dstack._internal.core.models.backends.base import BackendType
16-
from dstack._internal.core.models.instances import InstanceStatus
16+
from dstack._internal.core.models.instances import (
17+
Gpu,
18+
InstanceOffer,
19+
InstanceStatus,
20+
InstanceType,
21+
Resources,
22+
)
1723
from dstack._internal.core.models.users import GlobalRole, ProjectRole
1824
from dstack._internal.core.models.volumes import VolumeStatus
1925
from dstack._internal.server.models import BackendModel
@@ -212,6 +218,30 @@ async def test_creates_lambda_backend(
212218
@pytest.mark.skipif(sys.version_info < (3, 10), reason="Nebius requires Python 3.10")
213219
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
214220
class TestNebius:
221+
@pytest.fixture(autouse=True)
def patch_catalog(self):
    """Stub the Nebius catalog lookup with a single fixed offer.

    The fixture is autouse, so it is active for every test in this class.
    While it is active, ``get_catalog_offers`` as referenced from the Nebius
    ``resources`` module returns one 8xH100 offer whose backend data lists
    the fabrics ``fabric-2`` and ``fabric-3``.
    """
    h100_offer = InstanceOffer(
        backend=BackendType.NEBIUS,
        instance=InstanceType(
            name="gpu-h100-sxm 8gpu-128vcpu-1600gb",
            resources=Resources(
                cpus=128,
                memory_mib=1600 * 1024,
                gpus=[Gpu(name="H100", memory_mib=80 * 1024)] * 8,
                spot=False,
            ),
        ),
        region="eu-north1",
        price=23.6,
        backend_data={"fabrics": ["fabric-2", "fabric-3"]},
    )
    # Configure the mock's return value directly through `patch` instead of
    # assigning it on the mock object afterwards.
    with patch(
        "dstack._internal.core.backends.nebius.resources.get_catalog_offers",
        return_value=[h100_offer],
    ):
        yield
244+
215245
async def test_not_creates_with_invalid_creds(
216246
self, test_db, session: AsyncSession, client: AsyncClient
217247
):
@@ -238,18 +268,16 @@ async def test_not_creates_with_invalid_creds(
238268
assert len(res.scalars().all()) == 0
239269

240270
@pytest.mark.parametrize(
241-
("config_regions", "config_projects", "mocked_projects", "error"),
271+
("config_extra", "mocked_projects", "error"),
242272
[
243273
pytest.param(
244-
None,
245-
None,
274+
{},
246275
[_nebius_project()],
247276
None,
248277
id="default",
249278
),
250279
pytest.param(
251-
["eu-north1"],
252-
None,
280+
{"regions": ["eu-north1"]},
253281
[
254282
_nebius_project(
255283
"project-e00test", "default-project-eu-north1", "eu-north1"
@@ -260,15 +288,13 @@ async def test_not_creates_with_invalid_creds(
260288
id="with-regions",
261289
),
262290
pytest.param(
263-
["xx-xxxx1"],
264-
None,
291+
{"regions": ["xx-xxxx1"]},
265292
[_nebius_project()],
266293
"do not exist in this Nebius tenancy",
267294
id="error-invalid-regions",
268295
),
269296
pytest.param(
270-
["eu-north1"],
271-
None,
297+
{"regions": ["eu-north1"]},
272298
[
273299
_nebius_project(
274300
"project-e00test0", "default-project-eu-north1", "eu-north1"
@@ -279,8 +305,7 @@ async def test_not_creates_with_invalid_creds(
279305
id="finds-default-project-among-many",
280306
),
281307
pytest.param(
282-
["eu-north1"],
283-
None,
308+
{"regions": ["eu-north1"]},
284309
[
285310
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
286311
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -289,8 +314,7 @@ async def test_not_creates_with_invalid_creds(
289314
id="error-no-default-project",
290315
),
291316
pytest.param(
292-
None,
293-
["project-e00test0"],
317+
{"projects": ["project-e00test0"]},
294318
[
295319
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
296320
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -299,15 +323,13 @@ async def test_not_creates_with_invalid_creds(
299323
id="with-projects",
300324
),
301325
pytest.param(
302-
None,
303-
["project-e00xxxx"],
326+
{"projects": ["project-e00xxxx"]},
304327
[_nebius_project()],
305328
"not found in this Nebius tenancy",
306329
id="error-invalid-projects",
307330
),
308331
pytest.param(
309-
None,
310-
["project-e00test0", "project-e00test1"],
332+
{"projects": ["project-e00test0", "project-e00test1"]},
311333
[
312334
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
313335
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -316,8 +338,10 @@ async def test_not_creates_with_invalid_creds(
316338
id="error-multiple-projects-in-same-region",
317339
),
318340
pytest.param(
319-
["eu-north1"],
320-
["project-e00test"],
341+
{
342+
"regions": ["eu-north1"],
343+
"projects": ["project-e00test"],
344+
},
321345
[
322346
_nebius_project(
323347
"project-e00test", "default-project-eu-north1", "eu-north1"
@@ -327,15 +351,26 @@ async def test_not_creates_with_invalid_creds(
327351
None,
328352
id="with-regions-and-projects",
329353
),
354+
pytest.param(
355+
{"fabrics": ["fabric-2", "fabric-3"]},
356+
[_nebius_project()],
357+
None,
358+
id="with-valid-fabrics",
359+
),
360+
pytest.param(
361+
{"fabrics": ["fabric-2", "fabric-invalid"]},
362+
[_nebius_project()],
363+
"InfiniBand fabrics do not exist",
364+
id="with-invalid-fabrics",
365+
),
330366
],
331367
)
332368
async def test_create(
333369
self,
334370
test_db,
335371
session: AsyncSession,
336372
client: AsyncClient,
337-
config_regions: Optional[list[str]],
338-
config_projects: Optional[list[str]],
373+
config_extra: dict[str, Any],
339374
mocked_projects: Sequence[Any],
340375
error: Optional[str],
341376
):
@@ -347,8 +382,7 @@ async def test_create(
347382
body = {
348383
"type": "nebius",
349384
"creds": FAKE_NEBIUS_SERVICE_ACCOUNT_CREDS,
350-
"regions": config_regions,
351-
"projects": config_projects,
385+
**config_extra,
352386
}
353387
with patch(
354388
"dstack._internal.core.backends.nebius.resources.list_tenant_projects"

0 commit comments

Comments
 (0)