/* This function is called when we are going to perform some operation * in a given key, but such key may be already logically expired even if * it still exists in the database. The main way this function is called * is via lookupKey*() family of functions. * * The behavior of the function depends on the replication role of the * instance, because slave instances do not expire keys, they wait * for DELs from the master for consistency matters. However even * slaves will try to have a coherent return value for the function, * so that read commands executed in the slave side will be able to * behave like if the key is expired even if still present (because the * master has yet to propagate the DEL). * * In masters as a side effect of finding a key which is expired, such * key will be evicted from the database. Also this may trigger the * propagation of a DEL/UNLINK command in AOF / replication stream. * * The return value of the function is 0 if the key is still valid, * otherwise the function returns 1 if the key is expired. */ intexpireIfNeeded(redisDb *db, robj *key){ if (!keyIsExpired(db,key)) return0;
/* If we are running in the context of a slave, instead of * evicting the expired key from the database, we return ASAP: * the slave key expiration is controlled by the master that will * send us synthesized DEL operations for expired keys. * * Still we try to return the right information to the caller, * that is, 0 if we think the key should be still valid, 1 if * we think the key is expired at this time. */ if (server.masterhost != NULL) return1;
/* Check if the key is expired. */ intkeyIsExpired(redisDb *db, robj *key){ mstime_t when = getExpire(db,key); mstime_t now;
if (when < 0) return0; /* No expire for this key */
/* Don't expire anything while loading. It will be done later. */ if (server.loading) return0;
/* If we are in the context of a Lua script, we pretend that time is * blocked to when the Lua script started. This way a key can expire * only the first time it is accessed and not in the middle of the * script execution, making propagation to slaves / AOF consistent. * See issue #1525 on Github for more information. */ if (server.lua_caller) { now = server.lua_time_start; } /* If we are in the middle of a command execution, we still want to use * a reference time that does not change: in that case we just use the * cached time, that we update before each call in the call() function. * This way we avoid that commands such as RPOPLPUSH or similar, that * may re-open the same key multiple times, can invalidate an already * open object in a next call, if the next call will see the key expired, * while the first did not. */ elseif (server.fixed_time_expire > 0) { now = server.mstime; } /* For the other cases, we want to use the most fresh time we have. */ else { now = mstime(); }
/* The key expired if the current (virtual or real) time is greater * than the expire time of the key. */ return now > when; } /* Return the expire time of the specified key, or -1 if no expire * is associated with this key (i.e. the key is non volatile) */ longlonggetExpire(redisDb *db, robj *key){ dictEntry *de;
/* No expire? return ASAP */ if (dictSize(db->expires) == 0 || (de = dictFind(db->expires,key->ptr)) == NULL) return-1;
/* The entry was found in the expire dict, this means it should also * be present in the main dict (safety check). */ serverAssertWithInfo(NULL,key,dictFind(db->dict,key->ptr) != NULL); return dictGetSignedIntegerVal(de); }
/* This function handles 'background' operations we are required to do * incrementally in Redis databases, such as active key expiring, resizing, * rehashing. */ voiddatabasesCron(void){ /* Expire keys by random sampling. Not required for slaves * as master will synthesize DELs for us. */ if (server.active_expire_enabled) { if (server.masterhost == NULL) { activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); } else { expireSlaveKeys(); } }
/* Defrag keys gradually. */ activeDefragCycle();
/* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad * as will cause a lot of copy-on-write of memory pages. */ if (!hasActiveChildProcess()) { /* We use global counters so if we stop the computation at a given * DB we'll be able to start from the successive in the next * cron loop iteration. */ staticunsignedint resize_db = 0; staticunsignedint rehash_db = 0; int dbs_per_call = CRON_DBS_PER_CALL; int j;
/* Don't test more DBs than we have. */ if (dbs_per_call > server.dbnum) dbs_per_call = server.dbnum;
/* Rehash */ if (server.activerehashing) { for (j = 0; j < dbs_per_call; j++) { int work_done = incrementallyRehash(rehash_db); if (work_done) { /* If the function did some work, stop here, we'll do * more at the next cron loop. */ break; } else { /* If this db didn't need rehash, we'll try the next one. */ rehash_db++; rehash_db %= server.dbnum; } } } } } /* Try to expire a few timed out keys. The algorithm used is adaptive and * will use few CPU cycles if there are few expiring keys, otherwise * it will get more aggressive to avoid that too much memory is used by * keys that can be removed from the keyspace. * * Every expire cycle tests multiple databases: the next call will start * again from the next db, with the exception of exists for time limit: in that * case we restart again from the last database we were processing. Anyway * no more than CRON_DBS_PER_CALL databases are tested at every iteration. * * The function can perform more or less work, depending on the "type" * argument. It can execute a "fast cycle" or a "slow cycle". The slow * cycle is the main way we collect expired cycles: this happens with * the "server.hz" frequency (usually 10 hertz). * * However the slow cycle can exit for timeout, since it used too much time. * For this reason the function is also invoked to perform a fast cycle * at every event loop cycle, in the beforeSleep() function. The fast cycle * will try to perform less work, but will do it much more often. * * The following are the details of the two expire cycles and their stop * conditions: * * If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a * "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION * microseconds, and is not repeated again before the same amount of time. * The cycle will also refuse to run at all if the latest slow cycle did not * terminate because of a time limit condition. * * If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is * executed, where the time limit is a percentage of the REDIS_HZ period * as specified by the ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC define. In the * fast cycle, the check of every database is interrupted once the number * of already expired keys in the database is estimated to be lower than * a given percentage, in order to avoid doing too much work to gain too * little memory. * * The configured expire "effort" will modify the baseline parameters in * order to do more work in both the fast and slow expire cycles. */
#define ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP 20 /* Keys for each DB loop. */ #define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds. */ #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* Max % of CPU to use. */ #define ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE 10 /* % of stale keys after which we do extra efforts. */ voidactiveExpireCycle(int type){ /* Adjust the running parameters according to the configured expire * effort. The default effort is 1, and the maximum configurable effort * is 10. */ unsignedlong effort = server.active_expire_effort-1, /* Rescale from 0 to 9. */ config_keys_per_loop = ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP + ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP/4*effort, config_cycle_fast_duration = ACTIVE_EXPIRE_CYCLE_FAST_DURATION + ACTIVE_EXPIRE_CYCLE_FAST_DURATION/4*effort, config_cycle_slow_time_perc = ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC + 2*effort, config_cycle_acceptable_stale = ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE- effort;
/* This function has some global state in order to continue the work * incrementally across calls. */ staticunsignedint current_db = 0; /* Last DB tested. */ staticint timelimit_exit = 0; /* Time limit hit in previous call? */ staticlonglong last_fast_cycle = 0; /* When last fast cycle ran. */
int j, iteration = 0; int dbs_per_call = CRON_DBS_PER_CALL; longlong start = ustime(), timelimit, elapsed;
/* When clients are paused the dataset should be static not just from the * POV of clients not being able to write, but also from the POV of * expires and evictions of keys not being performed. */ if (clientsArePaused()) return;
if (type == ACTIVE_EXPIRE_CYCLE_FAST) { /* Don't start a fast cycle if the previous cycle did not exit * for time limit, unless the percentage of estimated stale keys is * too high. Also never repeat a fast cycle for the same period * as the fast cycle total duration itself. */ if (!timelimit_exit && server.stat_expired_stale_perc < config_cycle_acceptable_stale) return;
if (start < last_fast_cycle + (longlong)config_cycle_fast_duration*2) return;
last_fast_cycle = start; }
/* We usually should test CRON_DBS_PER_CALL per iteration, with * two exceptions: * * 1) Don't test more DBs than we have. * 2) If last time we hit the time limit, we want to scan all DBs * in this iteration, as there is work to do in some DB and we don't want * expired keys to use memory for too much time. */ if (dbs_per_call > server.dbnum || timelimit_exit) dbs_per_call = server.dbnum;
/* We can use at max 'config_cycle_slow_time_perc' percentage of CPU * time per iteration. Since this function gets called with a frequency of * server.hz times per second, the following is the max amount of * microseconds we can spend in this function. */ timelimit = config_cycle_slow_time_perc*1000000/server.hz/100; timelimit_exit = 0; if (timelimit <= 0) timelimit = 1;
if (type == ACTIVE_EXPIRE_CYCLE_FAST) timelimit = config_cycle_fast_duration; /* in microseconds. */
/* Accumulate some global stats as we expire keys, to have some idea * about the number of keys that are already logically expired, but still * existing inside the database. */ long total_sampled = 0; long total_expired = 0;
for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) { /* Expired and checked in a single loop. */ unsignedlong expired, sampled;
/* Increment the DB now so we are sure if we run out of time * in the current DB we'll restart from the next. This allows to * distribute the time evenly across DBs. */ current_db++;
/* Continue to expire if at the end of the cycle more than 25% * of the keys were expired. */ do { unsignedlong num, slots; longlong now, ttl_sum; int ttl_samples; iteration++;
/* If there is nothing to expire try next DB ASAP. */ if ((num = dictSize(db->expires)) == 0) { db->avg_ttl = 0; break; } slots = dictSlots(db->expires); now = mstime();
/* When there are less than 1% filled slots, sampling the key * space is expensive, so stop here waiting for better times... * The dictionary will be resized asap. */ if (num && slots > DICT_HT_INITIAL_SIZE && (num*100/slots < 1)) break;
/* The main collection cycle. Sample random keys among keys * with an expire set, checking for expired ones. */ expired = 0; sampled = 0; ttl_sum = 0; ttl_samples = 0;
if (num > config_keys_per_loop) num = config_keys_per_loop;
/* Here we access the low level representation of the hash table * for speed concerns: this makes this code coupled with dict.c, * but it hardly changed in ten years. * * Note that certain places of the hash table may be empty, * so we want also a stop condition about the number of * buckets that we scanned. However scanning for free buckets * is very fast: we are in the cache line scanning a sequential * array of NULL pointers, so we can scan a lot more buckets * than keys in the same time. */ long max_buckets = num*20; long checked_buckets = 0;
while (sampled < num && checked_buckets < max_buckets) { for (int table = 0; table < 2; table++) { if (table == 1 && !dictIsRehashing(db->expires)) break;
/* Scan the current bucket of the current table. */ checked_buckets++; while(de) { /* Get the next entry now since this entry may get * deleted. */ dictEntry *e = de; de = de->next;
ttl = dictGetSignedIntegerVal(e)-now; if (activeExpireCycleTryExpire(db,e,now)) expired++; if (ttl > 0) { /* We want the average TTL of keys yet * not expired. */ ttl_sum += ttl; ttl_samples++; } sampled++; } } db->expires_cursor++; } total_expired += expired; total_sampled += sampled;
/* Update the average TTL stats for this database. */ if (ttl_samples) { longlong avg_ttl = ttl_sum/ttl_samples;
/* Do a simple running average with a few samples. * We just use the current estimate with a weight of 2% * and the previous estimate with a weight of 98%. */ if (db->avg_ttl == 0) db->avg_ttl = avg_ttl; db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50); }
/* We can't block forever here even if there are many keys to * expire. So after a given amount of milliseconds return to the * caller waiting for the other active expire cycle. */ if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */ elapsed = ustime()-start; if (elapsed > timelimit) { timelimit_exit = 1; server.stat_expired_time_cap_reached_count++; break; } } /* We don't repeat the cycle for the current database if there are * an acceptable amount of stale keys (logically expired but yet * not reclained). */ } while ((expired*100/sampled) > config_cycle_acceptable_stale); }
/* Update our estimate of keys existing but yet to be expired. * Running average with this sample accounting for 5%. */ double current_perc; if (total_sampled) { current_perc = (double)total_expired/total_sampled; } else current_perc = 0; server.stat_expired_stale_perc = (current_perc*0.05)+ (server.stat_expired_stale_perc*0.95); }