Skip to content

Commit 046d5fd

Browse files
authored
llama: use host memory if device reports 0 memory (ggml-org#18587)
1 parent 480160d commit 046d5fd

File tree

4 files changed

+28
-8
lines changed

4 files changed

+28
-8
lines changed

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);

-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
         void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);

         // device type
```

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -4287,8 +4287,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }

 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;

     GGML_UNUSED(dev);
 }
```

src/llama-model.cpp

Lines changed: 12 additions & 4 deletions
```diff
@@ -2452,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
     }

+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     // calculate the split points
     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
     std::vector<float> splits(n_devices());
```
```diff
@@ -2462,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
             splits[i] = free;
         }
     } else {
```
```diff
@@ -2478,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }

-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
     const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
```

src/llama.cpp

Lines changed: 13 additions & 1 deletion
```diff
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free, total;
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free = free;
         ret[i].total = total;
     }
```

0 commit comments

Comments (0)