-
Couldn't load subscription status.
- Fork 842
Description
Hi ATS team,
We encountered an issue where a core dump occurs due to repeated calls to CacheVC::scanObject after a failed cache deletion.
I’d like to share our findings and ask for your insights on this behavior.
Issue Description
We implemented a test function to verify the cache deletion process.
This function directly calls cacheProcessor.scan() to check whether cached objects are properly removed.
During testing, we observed that when cacheProcessor.scan() is executed,
a series of unexpected behaviors occur — leading to repeated calls to CacheVC::scanObject and eventually a core dump.
After calling CacheVC::scanObject, the function attempts to delete the corresponding cache entry.
However, the cache is not actually deleted, and CacheVC::scanObject is called again, repeatedly attempting to delete the same file in an infinite loop.
In conclusion, this happens because the stripe value becomes inconsistent (different) between operations.
Our test function calls cacheProcessor.scan(). Details are described below.
cacheProcessor.scan()
=> caches[CACHE_FRAG_TYPE_HTTP]->scan()
=> CacheVC::scanStripe()
=> CacheVC::scanObject()
For debugging, we inserted debug messages into the following functions:
int
dir_probe(const CacheKey *key, Stripe *stripe, Dir *result, Dir **last_collision)
{
ink_assert(stripe->mutex->thread_holding == this_ethread());
int s = key->slice32(0) % stripe->segments;
int b = key->slice32(1) % stripe->buckets;
Dir *seg = stripe->dir_segment(s);
Dir *e = nullptr, *p = nullptr, *collision = *last_collision;
CHECK_DIR(d);
#ifdef LOOP_CHECK_MODE
if (dir_bucket_loop_fix(dir_bucket(b, seg), s, vol))
return 0;
#endif
Lagain:
e = dir_bucket(b, seg);
// Added for debugging (+)
DDbg(dbg_ctl_cache_stats, "==> stripe = %p, s = %d, b = %d, seg = %p, e = %p", stripe, s, b, seg, e);
if (stripe) {
DDbg(dbg_ctl_cache_stats, "stripe->fd = %d", stripe->fd);
}
if (e) {
DDbg(dbg_ctl_cache_stats, "boffset %" PRId64 "", dir_offset(e));
}
// Added for debugging (-)
...
Scenario
- During a test run,
cacheProcessor.scan()was manually invoked to verify cache deletion.
int
CacheVC::scanObject(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
Dbg(dbg_ctl_cache_scan_truss, "inside %p:scanObject", this);
Doc *doc = nullptr;
void *result = nullptr;
int hlen = 0;
char hname[500];
bool hostinfo_copied = false;
off_t next_object_len = 0;
bool might_need_overlap_read = false;
...
while (static_cast<off_t>(reinterpret_cast<char *>(doc) - buf->data()) + next_object_len <
static_cast<off_t>(io.aiocb.aio_nbytes)) {
might_need_overlap_read = false;
doc = reinterpret_cast<Doc *>(reinterpret_cast<char *>(doc) + next_object_len);
next_object_len = stripe->round_to_approx_size(doc->len);
int i;
bool changed;
...
while (true) {
// dir_probe debug info. stripe is 0x7ff5a0057030
// ===========================================================================================================
/*
[Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:559 (dir_probe)> (cache_stats) ==> stripe = 0x7ff5a0057030, s = 46, b = 5309, seg = 0x7ff59b852430, e = 0x7ff59b8861b8
[Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:561 (dir_probe)> (cache_stats) stripe->fd = 32
[Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:564 (dir_probe)> (cache_stats) boffset 244180
[Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:591 (dir_probe)> (dir_probe_hit) found (key=3447329293) D829FB03 DD1078BE vol 32 bucket 5309 boffset 244180
*/
// ===========================================================================================================
if (!dir_probe(&doc->first_key, stripe, &dir, &last_collision)) {
goto Lskip;
}
if (!dir_agg_valid(stripe, &dir) || !dir_head(&dir) ||
(stripe->vol_offset(&dir) != io.aiocb.aio_offset + (reinterpret_cast<char *>(doc) - buf->data()))) {
continue;
}
// Break occurs here and exits the loop.
break;
}
...
// vector.count() is 1.
changed = false;
hostinfo_copied = false;
for (i = 0; i < vector.count(); i++) {
if (!vector.get(i)->valid()) {
goto Lskip;
}
if (!hostinfo_copied) {
memccpy(hname, vector.get(i)->request_get()->host_get(&hlen), 0, 500);
hname[hlen] = 0;
Dbg(dbg_ctl_cache_scan, "hostname = '%s', hostlen = %d", hname, hlen);
// [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheVC.cc:940 (scanObject)> (cache_scan) hostname = 'test.aplatform.com', hostlen = 18
hostinfo_copied = true;
}
...
earliest_key = key;
int result1 = _action.continuation->handleEvent(CACHE_EVENT_SCAN_OBJECT, vector.get(i));
/*
// We simply check whether the object should be deleted during the scan test.
// If it matches, return CACHE_SCAN_RESULT_DELETE.
// url info: path [image/1B]
*/
switch (result1) {
case CACHE_SCAN_RESULT_CONTINUE:
continue;
case CACHE_SCAN_RESULT_DELETE: // Execution enters here.
changed = true;
vector.remove(i, true);
i--;
continue;
...
}
if (changed) {
if (!vector.count()) {
ink_assert(hostinfo_copied);
SET_HANDLER(&CacheVC::scanRemoveDone);
// force remove even if there is a writer
cacheProcessor.remove(this, &doc->first_key, CACHE_FRAG_TYPE_HTTP, hname, hlen); // Execution enters here.
return EVENT_CONT;
} else {
...
}
}
- Starting from the call to
cacheProcessor.remove(this, &doc->first_key, CACHE_FRAG_TYPE_HTTP, hname, hlen), the following behavior occurs...
Action *
CacheProcessor::remove(Continuation *cont, const CacheKey *key, CacheFragType frag_type, const char *hostname, int host_len)
{
Dbg(dbg_ctl_cache_remove, "[CacheProcessor::remove] Issuing cache delete for %u", cache_hash(*key));
// [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <Cache.cc:854 (remove)> (cache_remove) [CacheProcessor::remove] Issuing cache delete for 3447329293
return caches[frag_type]->remove(cont, key, frag_type, hostname, host_len);
}
Action *
Cache::remove(Continuation *cont, const CacheKey *key, CacheFragType type, const char *hostname, int host_len)
{
if (!CacheProcessor::IsCacheReady(type)) {
if (cont) {
cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, nullptr);
}
return ACTION_RESULT_DONE;
}
Ptr<ProxyMutex> mutex;
if (!cont) {
cont = new_CacheRemoveCont();
}
CACHE_TRY_LOCK(lock, cont->mutex, this_ethread());
ink_assert(lock.is_locked());
// ===========================================================================================================
// This is where the problem occurs.
// The stripe pointer is returned as 0x7ff5a0071030 instead of 0x7ff5a0057030.
// ===========================================================================================================
Stripe *stripe = key_to_stripe(key, hostname, host_len);
/*
[Oct 15 13:06:30.192] [ET_NET 5] DIAG: <Cache.cc:1880 (key_to_stripe)> (cache_hosting) Generic volume: b8028a88d for host: test.aplatform.com
*/
// coverity[var_decl]
Dir result;
dir_clear(&result); // initialized here, set result empty so we can recognize missed lock
mutex = cont->mutex;
CacheVC *c = new_CacheVC(cont);
c->vio.op = VIO::NONE;
c->frag_type = type;
c->op_type = static_cast<int>(CacheOpType::Remove);
Metrics::Gauge::increment(cache_rsb.status[c->op_type].active);
Metrics::Gauge::increment(stripe->cache_vol->vol_rsb.status[c->op_type].active);
c->first_key = c->key = *key;
c->stripe = stripe; // This is the problematic section.
c->dir = result;
c->f.remove = 1;
SET_CONTINUATION_HANDLER(c, &CacheVC::removeEvent);
int ret = c->removeEvent(EVENT_IMMEDIATE, nullptr); // This function is invoked.
if (ret == EVENT_DONE) {
return ACTION_RESULT_DONE;
} else {
return &c->_action;
}
}
- After calling
c->removeEvent(EVENT_IMMEDIATE, nullptr), the following behavior occurs…
int
CacheVC::removeEvent(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
...
Lcollision:
// check for collision
// In the normal case, execution should enter the if-statement.
// However, the stripe information has changed,
// so the code proceeds to the next routine after the if-statement.
/*
[Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:559 (dir_probe)> (cache_stats) ==> stripe = 0x7ff5a0071030, s = 46, b = 5309, seg = 0x7ff596890430, e = 0x7ff5968c41b8
[Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:561 (dir_probe)> (cache_stats) stripe->fd = 32
[Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:564 (dir_probe)> (cache_stats) boffset 0
[Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:621 (dir_probe)> (dir_probe_miss) missed (key=3447329293) D829FB03 DD1078BE on vol 32 bucket 5309 at 0x7ff596890430
*/
if (dir_probe(&key, stripe, &dir, &last_collision) > 0) {
int ret = do_read_call(&key);
if (ret == EVENT_RETURN) {
goto Lread;
}
return ret;
}
Ldone:
Metrics::Counter::increment(cache_rsb.status[static_cast<int>(CacheOpType::Remove)].failure);
Metrics::Counter::increment(stripe->cache_vol->vol_rsb.status[static_cast<int>(CacheOpType::Remove)].failure);
if (od) {
stripe->close_write(this);
}
}
ink_assert(!stripe || this_ethread() != stripe->mutex->thread_holding);
// ===========================================================================================================
// Execution reaches here.
// At this point, _action.continuation->handleEvent refers to the CacheVC::scanRemoveDone function.
// ===========================================================================================================
_action.continuation->handleEvent(CACHE_EVENT_REMOVE_FAILED, (void *)-ECACHE_NO_DOC);
goto Lfree;
Lremoved:
_action.continuation->handleEvent(CACHE_EVENT_REMOVE, nullptr);
Lfree:
return free_CacheVC(this);
}
- The
CacheVC::scanRemoveDonefunction does not handle any event. It simply callsCacheVC::scanObjectagain.
int
CacheVC::scanRemoveDone(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
{
Dbg(dbg_ctl_cache_scan_truss, "inside %p:scanRemoveDone", this);
Dbg(dbg_ctl_cache_scan, "remove done.");
alternate.destroy();
SET_HANDLER(&CacheVC::scanObject);
return handleEvent(EVENT_IMMEDIATE, nullptr);
}
- It goes back to step 1 and repeats the same routine. Eventually, a
core dumpoccurs.
Unfortunately, we don't have any debug logs related to the stripe at the time the cache was stored.
Therefore, we don't know why the stripe value during storage differs from the stripe returned by key_to_stripe() when it is called later.
Temporary Fix
This modification is only for verifying that cache deletion works correctly — performance was not considered.
Please note that this is for reference and testing purposes only.
The PURGE_ISSUE macro is not defined and this logic is not intended for production use.
Action *
Cache::remove(Continuation *cont, const CacheKey *key, CacheFragType type, const char *hostname, int host_len)
{
if (!CacheProcessor::IsCacheReady(type)) {
if (cont) {
cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, nullptr);
}
return ACTION_RESULT_DONE;
}
Ptr<ProxyMutex> mutex;
if (!cont) {
cont = new_CacheRemoveCont();
}
CACHE_TRY_LOCK(lock, cont->mutex, this_ethread());
ink_assert(lock.is_locked());
#ifndef PURGE_ISSUE
Stripe *stripe = nullptr;
#else
Stripe *stripe = key_to_stripe(key, hostname, host_len);
#endif
// coverity[var_decl]
Dir result;
dir_clear(&result); // initialized here, set result empty so we can recognize missed lock
mutex = cont->mutex;
#ifndef PURGE_ISSUE
if (auto *vc = dynamic_cast<CacheVC *>(cont)) {
if (vc->stripe) {
stripe = vc->stripe;
Dbg(dbg_ctl_cache_remove, "Reusing stripe from VC: %p", stripe);
}
}
if (!stripe) {
stripe = key_to_stripe(key, hostname, host_len);
Dbg(dbg_ctl_cache_remove, "Newly calculated stripe: %p", stripe);
}
#endif
CacheVC *c = new_CacheVC(cont);
c->vio.op = VIO::NONE;
c->frag_type = type;
c->op_type = static_cast<int>(CacheOpType::Remove);
Metrics::Gauge::increment(cache_rsb.status[c->op_type].active);
Metrics::Gauge::increment(stripe->cache_vol->vol_rsb.status[c->op_type].active);
c->first_key = c->key = *key;
c->stripe = stripe;
c->dir = result;
c->f.remove = 1;
SET_CONTINUATION_HANDLER(c, &CacheVC::removeEvent);
int ret = c->removeEvent(EVENT_IMMEDIATE, nullptr);
if (ret == EVENT_DONE) {
return ACTION_RESULT_DONE;
} else {
return &c->_action;
}
}
ATS Information
- OS: Ubuntu 22.04
- ats version: 10.0.4
- [hosting.config] : empty
- [cache.config] : empty
- [storage.config]
/dev/sdb id=cache.disk.0
- [volume.config]
volume=1 scheme=http size=10%
volume=2 scheme=http size=10%
volume=3 scheme=http size=10%
volume=4 scheme=http size=10%
volume=5 scheme=http size=10%
volume=6 scheme=http size=10%
volume=7 scheme=http size=10%
volume=8 scheme=http size=10%
volume=9 scheme=http size=10%
We’d appreciate it if you could review this behavior and let us know if this is a known issue or if we’re missing something.
Thank you for your time and support!