/*
* Copyright (c) 1993 by Sun Microsystems, Inc.
*/
#pragma ident "@(#)kmem.c 1.25 95/10/11 SMI"
/*
* Kernel memory allocator, as described in:
*
* Jeff Bonwick,
* The Slab Allocator: An Object-Caching Kernel Memory Allocator.
* Proceedings of the Summer 1994 Usenix Conference.
*
* See /shared/sac/PSARC/1994/028 for copies of the paper and
* related design documentation.
*/
#include <sys/kmem.h>
#include <sys/kmem_impl.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/mutex.h>
#include <sys/bitmap.h>
#include <sys/vtrace.h>
#include <sys/kobj.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <sys/map.h>
extern void prom_printf(char *fmt, ...);
extern char Syslimit[];
extern char Sysbase[];
struct kmem_cpu_kstat {
kstat_named_t kmcpu_alloc_from;
kstat_named_t kmcpu_free_to;
kstat_named_t kmcpu_buf_avail;
} kmem_cpu_kstat_template = {
{ "alloc_from_cpu%d", KSTAT_DATA_LONG },
{ "free_to_cpu%d", KSTAT_DATA_LONG },
{ "buf_avail_cpu%d", KSTAT_DATA_LONG },
};
struct kmem_cpu_kstat *kmem_cpu_kstat;
struct kmem_cache_kstat {
kstat_named_t kmc_buf_size;
kstat_named_t kmc_align;
kstat_named_t kmc_chunk_size;
kstat_named_t kmc_slab_size;
kstat_named_t kmc_alloc;
kstat_named_t kmc_alloc_fail;
kstat_named_t kmc_depot_alloc;
kstat_named_t kmc_depot_free;
kstat_named_t kmc_depot_contention;
kstat_named_t kmc_global_alloc;
kstat_named_t kmc_buf_avail;
kstat_named_t kmc_buf_total;
kstat_named_t kmc_buf_max;
kstat_named_t kmc_slab_create;
kstat_named_t kmc_slab_destroy;
kstat_named_t kmc_hash_size;
kstat_named_t kmc_hash_lookup_depth;
kstat_named_t kmc_hash_rescale;
kstat_named_t kmc_full_magazines;
kstat_named_t kmc_empty_magazines;
kstat_named_t kmc_magazine_size;
} kmem_cache_kstat_template = {
{ "buf_size", KSTAT_DATA_LONG },
{ "align", KSTAT_DATA_LONG },
{ "chunk_size", KSTAT_DATA_LONG },
{ "slab_size", KSTAT_DATA_LONG },
{ "alloc", KSTAT_DATA_LONG },
{ "alloc_fail", KSTAT_DATA_LONG },
{ "depot_alloc", KSTAT_DATA_LONG },
{ "depot_free", KSTAT_DATA_LONG },
{ "depot_contention", KSTAT_DATA_LONG },
{ "global_alloc", KSTAT_DATA_LONG },
{ "buf_avail", KSTAT_DATA_LONG },
{ "buf_total", KSTAT_DATA_LONG },
{ "buf_max", KSTAT_DATA_LONG },
{ "slab_create", KSTAT_DATA_LONG },
{ "slab_destroy", KSTAT_DATA_LONG },
{ "hash_size", KSTAT_DATA_LONG },
{ "hash_lookup_depth", KSTAT_DATA_LONG },
{ "hash_rescale", KSTAT_DATA_LONG },
{ "full_magazines", KSTAT_DATA_LONG },
{ "empty_magazines", KSTAT_DATA_LONG },
{ "magazine_size", KSTAT_DATA_LONG },
};
struct kmem_cache_kstat *kmem_cache_kstat;
struct {
kstat_named_t arena_size;
kstat_named_t huge_alloc;
kstat_named_t huge_alloc_fail;
kstat_named_t perm_size;
kstat_named_t perm_alloc;
kstat_named_t perm_alloc_fail;
} kmem_misc_kstat = {
{ "arena_size", KSTAT_DATA_LONG },
{ "huge_alloc", KSTAT_DATA_LONG },
{ "huge_alloc_fail", KSTAT_DATA_LONG },
{ "perm_size", KSTAT_DATA_LONG },
{ "perm_alloc", KSTAT_DATA_LONG },
{ "perm_alloc_fail", KSTAT_DATA_LONG },
};
/*
* The default set of caches to back kmem_alloc().
* These sizes should be reevaluated periodically.
*/
static int kmem_alloc_sizes[] = {
8,
16, 24,
32, 40, 48, 56,
64, 80, 96, 112,
128, 144, 160, 176, 192, 208, 224, 240,
256, 320, 384, 448,
512, 576, 672, 800,
1024, 1152, 1344, 1632,
2048, 2720,
4096, 5440, 6144, 6816,
8192, 9536, 12288,
16384
};
#define KMEM_MAXBUF 16384
static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
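/*
 * Worked example of the size-to-cache mapping (illustrative only; assumes
 * KMEM_ALIGN is 8 and KMEM_ALIGN_SHIFT is 3, as the 8-byte granularity of
 * kmem_alloc_sizes[] above implies): a kmem_alloc(100, KM_SLEEP) request
 * computes
 *
 *	index = (100 - 1) >> 3 = 12
 *
 * and kmem_init() points kmem_alloc_table[12] at the "kmem_alloc_112"
 * cache, so the caller actually receives a 112-byte buffer.
 */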
/*
* The magazine types for fast per-cpu allocation
*/
typedef struct kmem_magazine_type {
int mt_magsize; /* magazine size (number of rounds) */
int mt_align; /* magazine alignment */
int mt_minbuf; /* all smaller buffers qualify */
int mt_maxbuf; /* no larger buffers qualify */
kmem_cache_t *mt_cache;
} kmem_magazine_type_t;
kmem_magazine_type_t kmem_magazine_type[] = {
{ 0, 0, 16384, 16384 },
{ 1, 8, 3200, 8192 },
{ 3, 16, 256, 4096 },
{ 7, 32, 64, 2048 },
{ 15, 64, 0, 1024 },
{ 31, 64, 0, 512 },
{ 47, 64, 0, 256 },
{ 63, 64, 0, 128 },
{ 95, 64, 0, 64 },
{ 143, 64, 0, 0 },
};
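/*
 * Example of how this table is consulted by kmem_cache_create() below
 * (illustrative numbers only): for a cache whose chunk size is 64 bytes,
 * the mt_maxbuf scan stops at the final { 143, 64, 0, 0 } entry, so the
 * magazine size may eventually grow to 143 rounds, while the mt_minbuf
 * scan stops at { 15, 64, 0, 1024 }, so the cache starts out with 15-round
 * magazines and kmem_magazine_resize() can grow them from there.
 */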
static int kmem_reap_lasttime; /* time of last reap */
int kmem_reap_interval; /* don't allow more frequent reaping */
int kmem_depot_contention; /* max failed depot tryenters/minute */
int kmem_reapahead = 0; /* diff of kmem_reap and pageout thresholds */
int kmem_maxraw = -1;
int kmem_align = KMEM_ALIGN; /* minimum alignment for all caches */
int kmem_panic = 1; /* by default, panic on error */
int kmem_logging = 1;
#ifdef DEBUG
int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_VERIFY;
int kmem_fast_debug = 1;
#else
int kmem_flags = 0;
int kmem_fast_debug = 0;
#endif
int kmem_ready;
static kmem_cache_t *kmem_slab_cache;
static kmem_cache_t *kmem_bufctl_cache;
static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
static kmem_cache_t *kmem_cache_freelist;
kmem_cache_t kmem_null_cache;
static kmem_bufctl_t kmem_null_bufctl;
static kmutex_t kmem_async_lock;
static kmem_async_t *kmem_async_freelist;
static kmem_async_t kmem_async_queue;
static kcondvar_t kmem_async_cv;
static kmutex_t kmem_async_serialize;
static kmutex_t kmem_perm_lock;
static kmem_perm_t *kmem_perm_freelist;
kmem_bufctl_audit_t *kmem_log; /* for buffer auditing, if enabled */
int kmem_log_size;
int kmem_log_count;
int kmem_log_display = 2; /* # to show in kmem_error */
#define KMEM_BZERO_INLINE 48 /* do inline bzero for smaller bufs */
#define KMEM_BZERO(bufarg, sizearg) \
{ \
u_int Xsize = (u_int)(sizearg); \
u_longlong_t *Xbuf = (u_longlong_t *)(bufarg); \
if (Xsize > KMEM_BZERO_INLINE) { \
bzero((void *)Xbuf, Xsize); \
} else { \
u_longlong_t *Xend; \
Xend = (u_longlong_t *)((char *)Xbuf + Xsize); \
while (Xbuf < Xend) \
*Xbuf++ = 0LL; \
} \
}
#define KMERR_MODIFIED 0 /* buffer modified while on freelist */
#define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */
#define KMERR_BADADDR 2 /* freed a bad (unallocated) address */
#define KMERR_DUPFREE 3 /* freed a buffer twice */
#define KMERR_BADBUFTAG 4 /* buftag corrupted */
#define KMERR_BADBUFCTL 5 /* bufctl corrupted */
#define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */
struct {
hrtime_t kmp_timestamp; /* timestamp of panic */
int kmp_error; /* type of kmem error */
int kmp_count; /* index into kmem_log at panic */
kmem_cache_t *kmp_cache; /* buffer's cache */
void *kmp_buffer; /* buffer that induced panic */
kmem_bufctl_t *kmp_bufctl; /* buffer's bufctl */
kmem_bufctl_t *kmp_realbcp; /* bufctl according to kmem_locate() */
} kmem_panic_info;
static void
copy_pattern(u_long pattern, void *buf_arg, int size)
{
u_long *bufend = (u_long *)((char *)buf_arg + size);
u_long *buf;
for (buf = buf_arg; buf < bufend; buf++)
*buf = pattern;
}
static void *
verify_pattern(u_long pattern, void *buf_arg, int size)
{
u_long *bufend = (u_long *)((char *)buf_arg + size);
u_long *buf;
for (buf = buf_arg; buf < bufend; buf++)
if (*buf != pattern)
return (buf);
return (NULL);
}
/*
* Debugging support. Given any buffer address, find its bufctl
* by searching every cache in the system.
*/
static kmem_bufctl_t *
kmem_locate(void *buf)
{
kmem_cache_t *cp;
kmem_bufctl_t *bcp = NULL;
mutex_enter(&kmem_cache_lock);
for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache;
cp = cp->cache_next) {
if (!(cp->cache_flags & KMF_HASH))
continue;
mutex_enter(&cp->cache_lock);
for (bcp = *KMEM_HASH(cp, buf); bcp != NULL; bcp = bcp->bc_next)
if (bcp->bc_addr == buf)
break;
mutex_exit(&cp->cache_lock);
if (bcp != NULL)
break;
}
mutex_exit(&kmem_cache_lock);
return (bcp);
}
static void
kmem_bufctl_display(kmem_bufctl_audit_t *bcp)
{
int d;
timestruc_t ts;
hrt2ts(kmem_panic_info.kmp_timestamp - bcp->bc_timestamp, &ts);
prom_printf("\nthread=%x time=T-%d.%9d slab=%x cache: %s\n",
bcp->bc_thread, ts.tv_sec, ts.tv_nsec,
bcp->bc_slab, bcp->bc_cache->cache_name);
for (d = 0; d < bcp->bc_depth; d++) {
u_int off;
char *sym = kobj_getsymname(bcp->bc_stack[d], &off);
prom_printf("%s+%x\n", sym ? sym : "?", off);
}
}
static void
kmem_error(int error, kmem_cache_t *cp, void *buf, kmem_bufctl_t *bcp)
{
kmem_bufctl_audit_t *logp, logcopy;
kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
u_long *off;
kmem_logging = 0; /* stop logging when a bad thing happens */
kmem_panic_info.kmp_timestamp = gethrtime();
kmem_panic_info.kmp_error = error;
kmem_panic_info.kmp_count = kmem_log_count;
kmem_panic_info.kmp_cache = cp;
kmem_panic_info.kmp_buffer = buf;
kmem_panic_info.kmp_bufctl = bcp;
prom_printf("kernel memory allocator: ");
switch (error) {
case KMERR_MODIFIED:
prom_printf("buffer modified after being freed\n");
off = verify_pattern(0xdeadbeef, buf,
cp ? cp->cache_offset : ((int *)buf)[-1]);
if (off == NULL) /* shouldn't happen */
off = buf;
prom_printf("modification occurred at offset 0x%x "
"(0x%x replaced by 0x%x)\n",
(int)off - (int)buf, 0xdeadbeef, *off);
break;
case KMERR_REDZONE:
prom_printf("redzone violation: write past end of buffer\n");
break;
case KMERR_BADADDR:
bcp = kmem_locate(buf);
if (bcp && bcp->bc_slab->slab_cache != cp) {
kmem_panic_info.kmp_error = KMERR_BADCACHE;
prom_printf("buffer freed to wrong cache\n");
prom_printf("buffer was allocated from cache %s,\n",
bcp->bc_slab->slab_cache->cache_name);
prom_printf("caller attempting free to cache %s.\n",
cp->cache_name);
} else {
prom_printf("invalid free: buffer not in cache\n");
}
break;
case KMERR_DUPFREE:
prom_printf("duplicate free: buffer freed twice\n");
break;
case KMERR_BADBUFTAG:
prom_printf("boundary tag corrupted\n");
prom_printf("bcp ^ bxstat = %x, should be %x\n",
(int)bcp ^ btp->bt_bxstat, KMEM_BUFTAG_FREE);
bcp = kmem_locate(buf);
break;
case KMERR_BADBUFCTL:
prom_printf("bufctl corrupted\n");
prom_printf("bcp->bc_addr = %x, should be %x\n",
bcp->bc_addr, buf);
bcp = kmem_locate(buf);
break;
}
kmem_panic_info.kmp_realbcp = bcp;
prom_printf("buffer=%x bufctl=%x cache: %s\n",
buf, bcp, cp ? cp->cache_name : "none");
if (cp && (cp->cache_flags & KMF_AUDIT)) {
int i;
int count = 0;
prom_printf("previous transactions on buffer %x:\n", buf);
for (i = kmem_log_count + kmem_log_size - 1;
i > kmem_log_count; i--) {
logp = &kmem_log[i & (kmem_log_size - 1)];
mutex_enter(&kmem_cache_lock);
if (logp->bc_addr != buf) {
mutex_exit(&kmem_cache_lock);
continue;
}
logcopy = *logp;
mutex_exit(&kmem_cache_lock);
logp = &logcopy;
kmem_bufctl_display(logp);
if (++count >= kmem_log_display) {
if (logp->bc_lastlog)
prom_printf("\n(transaction log "
"continues at %x)\n",
logp->bc_lastlog);
break;
}
}
/*
* If we didn't find any history in the log (because
* it wrapped since the last transaction on this buffer),
* we can still get the last transaction from the bufctl.
*/
if (count == 0 && kmem_log_display > 0) {
logp = (kmem_bufctl_audit_t *)kmem_locate(buf);
if (logp != NULL)
kmem_bufctl_display(logp);
}
}
if (kmem_panic)
cmn_err(CE_PANIC, "kernel heap corruption detected");
debug_enter(NULL);
kmem_logging = 1; /* resume logging */
}
/*
* Get pages from the VM system and update all the bean counters.
*/
static void *
kmem_page_alloc(int npages, int flags)
{
void *pages;
mutex_enter(&freemem_lock);
while ((availrmem - npages < tune.t_minarmem) ||
(availsmem - npages < tune.t_minasmem)) {
mutex_exit(&freemem_lock);
if (flags & KM_NOSLEEP)
return (NULL);
/*
* We're out of memory. It would be nice if there
* were something appropriate to cv_wait() for,
* but there are currently many ways for pages to
* come and go -- there's no reliable, centralized
* notification mechanism. So, we just hang out
* for a moment, give pageout a chance to run,
* and try again. It's lame, but this situation is
* rare in practice -- all we're really trying to do
* here is unwedge the system if it gets stuck.
*/
kmem_reap();
delay(HZ >> 2);
mutex_enter(&freemem_lock);
}
availrmem -= npages;
availsmem -= npages;
pages_pp_kernel += npages;
kmem_misc_kstat.arena_size.value.l += ptob(npages);
mutex_exit(&freemem_lock);
if ((pages = kmem_getpages(npages, KM_NOSLEEP)) == NULL) {
/*
* We couldn't get pages with KM_NOSLEEP. This might be
* because we're out of physical memory, but it could also
* be because we're out of kernelmap. In the latter case,
* kmem_reap() is the only potential salvation.
*/
if (flags & KM_NOSLEEP) {
mutex_enter(&freemem_lock);
availrmem += npages;
availsmem += npages;
pages_pp_kernel -= npages;
kmem_misc_kstat.arena_size.value.l -= ptob(npages);
mutex_exit(&freemem_lock);
} else {
kmem_reap();
pages = kmem_getpages(npages, KM_SLEEP);
ASSERT(pages != NULL);
}
}
return (pages);
}
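/*
 * Note on the allocation-flag contract enforced above, with an
 * illustrative caller sketch: a KM_NOSLEEP request may return NULL and
 * the caller must check for that, while a KM_SLEEP request reaps and
 * retries until it succeeds and is therefore expected never to return
 * NULL.  A typical NOSLEEP caller looks like:
 *
 *	if ((p = kmem_alloc(size, KM_NOSLEEP)) == NULL)
 *		return (ENOMEM);
 */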
/*
* Give pages back to the VM system and update all the bean counters.
*/
static void
kmem_page_free(void *pages, int npages)
{
kmem_freepages(pages, npages);
mutex_enter(&freemem_lock);
availrmem += npages;
availsmem += npages;
pages_pp_kernel -= npages;
kmem_misc_kstat.arena_size.value.l -= ptob(npages);
mutex_exit(&freemem_lock);
}
/*
* XXX -- This interface can't express the availability of more than 4GB of
* kernel memory, so just truncate it to 1GB if there's more. (Truncation to
* 1GB rather than 2GB or 4GB protects the caller from arithmetic overflow.)
* kmem_avail() is used to make policy decisions when memory is low, not when
* memory is plentiful, so truncation hardly matters. Convert this to return
* u_longlong_t in 2.6.
*/
u_long
kmem_avail(void)
{
int rmem = (int)(availrmem - tune.t_minarmem);
int fmem = (int)(freemem - minfree);
int pages_avail = min(max(min(rmem, fmem), 0), 1 << (30 - PAGESHIFT));
return (ptob(pages_avail));
}
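/*
 * Arithmetic behind the cap above (illustrative, assuming 4K pages,
 * i.e. PAGESHIFT == 12): pages_avail is clamped to 1 << (30 - 12) ==
 * 262144 pages, so ptob(pages_avail) never exceeds 1 << 30 == 1GB,
 * which still fits in a signed 32-bit quantity even after a caller
 * doubles it.
 */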
/*
* Return the maximum amount of memory that is (in theory) allocatable
* from the heap. This may be used as an estimate only since there
* is no guarantee this space will still be available when an allocation
* request is made, nor that it can be allocated in one big request
* due to kernelmap fragmentation.
*/
u_longlong_t
kmem_maxavail(void)
{
int max_phys = (int)(availrmem - tune.t_minarmem);
int max_virt = (int)kmem_maxvirt();
int pages_avail = max(min(max_phys, max_virt), 0);
return ((u_longlong_t)pages_avail << PAGESHIFT);
}
/*
* Allocate memory permanently.
*/
void *
kmem_perm_alloc(size_t size, int align, int flags)
{
kmem_perm_t *pp, **prev_ppp, **best_prev_ppp;
char *buf;
int best_avail = INT_MAX;
if (align < KMEM_ALIGN)
align = KMEM_ALIGN;
if ((align & (align - 1)) || align > PAGESIZE)
cmn_err(CE_PANIC, "kmem_perm_alloc: bad alignment %d", align);
size = (size + align - 1) & -align;
mutex_enter(&kmem_perm_lock);
kmem_misc_kstat.perm_alloc.value.l++;
best_prev_ppp = NULL;
for (prev_ppp = &kmem_perm_freelist; (pp = *prev_ppp) != NULL;
prev_ppp = &pp->perm_next) {
if (pp->perm_avail - (-(int)pp->perm_current & (align - 1)) >=
size && pp->perm_avail < best_avail) {
best_prev_ppp = prev_ppp;
best_avail = pp->perm_avail;
}
}
if ((prev_ppp = best_prev_ppp) == NULL) {
int npages = btopr(size + sizeof (kmem_perm_t));
mutex_exit(&kmem_perm_lock);
buf = kmem_page_alloc(npages, flags);
if (buf == NULL) {
kmem_misc_kstat.perm_alloc_fail.value.l++;
return (NULL);
}
mutex_enter(&kmem_perm_lock);
kmem_misc_kstat.perm_size.value.l += ptob(npages);
pp = (kmem_perm_t *)buf;
pp->perm_next = kmem_perm_freelist;
pp->perm_current = buf + sizeof (kmem_perm_t);
pp->perm_avail = ptob(npages) - sizeof (kmem_perm_t);
kmem_perm_freelist = pp;
prev_ppp = &kmem_perm_freelist;
}
pp = *prev_ppp;
buf = (char *)(((int)pp->perm_current + align - 1) & -align);
pp->perm_avail = pp->perm_avail - (buf + size - pp->perm_current);
pp->perm_current = buf + size;
if (pp->perm_avail < KMEM_PERM_MINFREE)
*prev_ppp = pp->perm_next;
mutex_exit(&kmem_perm_lock);
return (buf);
}
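/*
 * Worked example of the rounding above (illustrative): a request for
 * size = 20 with align = 8 is padded to
 *
 *	size = (20 + 8 - 1) & -8 = 24
 *
 * and a perm_current that sits 4 bytes past an 8-byte boundary is bumped
 * up by 4 bytes before being returned, so the buffer is always
 * align-aligned and perm_avail shrinks by the padding plus the rounded
 * size.
 */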
static kmem_slab_t *
kmem_slab_create(kmem_cache_t *cp, int flags)
{
int bufsize = cp->cache_bufsize;
int chunksize = cp->cache_chunksize;
int cache_flags = cp->cache_flags;
int color, chunks;
void (*constructor)(void *, size_t) = cp->cache_constructor;
void (*destructor)(void *, size_t) = cp->cache_destructor;
char *buf, *base, *limit;
kmem_slab_t *sp;
kmem_bufctl_t *bcp;
ASSERT(MUTEX_HELD(&cp->cache_lock));
TRACE_2(TR_FAC_KMEM, TR_KMEM_SLAB_CREATE_START,
"kmem_slab_create_start:cache %S flags %x",
cp->cache_name, flags);
if ((color = cp->cache_color += cp->cache_align) > cp->cache_maxcolor)
color = cp->cache_color = 0;
mutex_exit(&cp->cache_lock);
if ((base = kmem_page_alloc(btop(cp->cache_slabsize), flags)) == NULL)
goto page_alloc_failure;
if (cache_flags & KMF_DEADBEEF) {
copy_pattern(0xdeadbeef, base, cp->cache_slabsize);
constructor = destructor = NULL;
}
if (cache_flags & KMF_HASH) {
limit = base + cp->cache_slabsize;
if ((sp = kmem_cache_alloc(kmem_slab_cache, flags)) == NULL)
goto slab_alloc_failure;
} else {
limit = base + cp->cache_slabsize - sizeof (kmem_slab_t);
sp = (kmem_slab_t *)limit;
}
sp->slab_cache = cp;
sp->slab_head = NULL;
sp->slab_refcnt = 0;
sp->slab_base = buf = base + color;
chunks = 0;
while (buf + chunksize <= limit) {
if (cache_flags & KMF_HASH) {
bcp = kmem_cache_alloc(kmem_bufctl_cache, flags);
if (bcp == NULL)
goto bufctl_alloc_failure;
if (cache_flags & KMF_AUDIT) {
bzero((void *)bcp,
sizeof (kmem_bufctl_audit_t));
((kmem_bufctl_audit_t *)bcp)->bc_cache = cp;
}
bcp->bc_addr = buf;
bcp->bc_slab = sp;
} else {
bcp = (kmem_bufctl_t *)((int)buf + cp->cache_offset);
}
if (cache_flags & KMF_BUFTAG) {
kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
copy_pattern(KMEM_REDZONE_WORD, btp->bt_redzone,
KMEM_REDZONE_SIZE * sizeof (void *));
btp->bt_bufctl = bcp;
btp->bt_bxstat = (int)bcp ^ KMEM_BUFTAG_FREE;
}
bcp->bc_next = sp->slab_head;
sp->slab_head = bcp;
if (chunks == 0)
sp->slab_tail = bcp;
if (constructor != NULL)
(*constructor)(buf, bufsize);
buf += chunksize;
chunks++;
}
sp->slab_chunks = chunks;
mutex_enter(&cp->cache_lock);
cp->cache_slab_create++;
cp->cache_buftotal += sp->slab_chunks;
if (cp->cache_buftotal > cp->cache_bufmax)
cp->cache_bufmax = cp->cache_buftotal;
TRACE_1(TR_FAC_KMEM, TR_KMEM_SLAB_CREATE_END,
"kmem_slab_create_end:slab %x", sp);
return (sp);
bufctl_alloc_failure:
while ((bcp = sp->slab_head) != NULL) {
if (destructor != NULL)
(*destructor)(bcp->bc_addr, bufsize);
sp->slab_head = bcp->bc_next;
kmem_cache_free(kmem_bufctl_cache, bcp);
}
kmem_cache_free(kmem_slab_cache, sp);
slab_alloc_failure:
kmem_page_free(base, btop(cp->cache_slabsize));
page_alloc_failure:
mutex_enter(&cp->cache_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_SLAB_CREATE_END,
"kmem_slab_create_end:slab %x", sp);
return (NULL);
}
static void
kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp)
{
int bufsize = cp->cache_bufsize;
int cache_flags = cp->cache_flags;
void (*destructor)(void *, size_t) = cp->cache_destructor;
kmem_bufctl_t *bcp, *next_bcp;
void *buf, *base;
ASSERT(MUTEX_HELD(&cp->cache_lock));
TRACE_2(TR_FAC_KMEM, TR_KMEM_SLAB_DESTROY_START,
"kmem_slab_destroy_start:cache %S slab %x", cp->cache_name, sp);
cp->cache_slab_destroy++;
cp->cache_buftotal -= sp->slab_chunks;
mutex_exit(&cp->cache_lock);
if (cache_flags & KMF_DEADBEEF)
destructor = NULL;
next_bcp = sp->slab_head;
sp->slab_tail->bc_next = NULL; /* normally a garbage pointer */
while ((bcp = next_bcp) != NULL) {
next_bcp = bcp->bc_next;
if (cache_flags & KMF_HASH) {
buf = bcp->bc_addr;
kmem_cache_free(kmem_bufctl_cache, bcp);
} else {
buf = (void *)((int)bcp - cp->cache_offset);
}
if (destructor != NULL)
(*destructor)(buf, bufsize);
}
base = (void *)((int)sp->slab_base & -PAGESIZE);
if (cache_flags & KMF_HASH)
kmem_cache_free(kmem_slab_cache, sp);
kmem_page_free(base, btop(cp->cache_slabsize));
mutex_enter(&cp->cache_lock);
TRACE_0(TR_FAC_KMEM, TR_KMEM_SLAB_DESTROY_END, "kmem_slab_destroy_end");
}
static void
kmem_audit(kmem_bufctl_audit_t *bcp)
{
kmem_bufctl_audit_t *logp;
bcp->bc_timestamp = gethrtime();
bcp->bc_thread = curthread;
bcp->bc_depth = getpcstack(bcp->bc_stack, KMEM_STACK_DEPTH);
if (kmem_logging == 0)
return;
mutex_enter(&kmem_cache_lock);
logp = &kmem_log[kmem_log_count++ & (kmem_log_size - 1)];
*logp = *bcp;
bcp->bc_lastlog = logp;
mutex_exit(&kmem_cache_lock);
}
static void
kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf)
{
kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
kmem_bufctl_t *bcp = btp->bt_bufctl;
if (btp->bt_bxstat != ((int)bcp ^ KMEM_BUFTAG_FREE)) {
kmem_error(KMERR_BADBUFTAG, cp, buf, bcp);
return;
}
btp->bt_bxstat = (int)bcp ^ KMEM_BUFTAG_ALLOC;
if ((cp->cache_flags & KMF_HASH) != 0 && bcp->bc_addr != buf) {
kmem_error(KMERR_BADBUFCTL, cp, buf, bcp);
return;
}
if (cp->cache_flags & KMF_REDZONE)
btp->bt_redzone[0] = KMEM_REDZONE_WORD;
if (cp->cache_flags & KMF_DEADBEEF) {
if (verify_pattern(0xdeadbeef, buf, cp->cache_offset) != NULL)
kmem_error(KMERR_MODIFIED, cp, buf, bcp);
copy_pattern(0xbaddcafe, buf, cp->cache_offset);
if (cp->cache_constructor != NULL)
(*cp->cache_constructor)(buf, cp->cache_bufsize);
}
if (cp->cache_flags & KMF_AUDIT)
kmem_audit((kmem_bufctl_audit_t *)bcp);
}
static int
kmem_cache_free_debug(kmem_cache_t *cp, void *buf)
{
kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
kmem_bufctl_t *bcp = btp->bt_bufctl;
if (btp->bt_bxstat != ((int)bcp ^ KMEM_BUFTAG_ALLOC)) {
if (btp->bt_bxstat == ((int)bcp ^ KMEM_BUFTAG_FREE)) {
kmem_error(KMERR_DUPFREE, cp, buf, bcp);
return (-1);
}
bcp = kmem_locate(buf);
if (bcp == NULL || bcp->bc_slab->slab_cache != cp)
kmem_error(KMERR_BADADDR, cp, buf, NULL);
else
kmem_error(KMERR_REDZONE, cp, buf, bcp);
return (-1);
}
btp->bt_bxstat = (int)bcp ^ KMEM_BUFTAG_FREE;
if ((cp->cache_flags & KMF_HASH) != 0 && bcp->bc_addr != buf) {
kmem_error(KMERR_BADBUFCTL, cp, buf, bcp);
return (-1);
}
if (cp->cache_flags & KMF_REDZONE) {
if (verify_pattern(KMEM_REDZONE_WORD, btp->bt_redzone,
KMEM_REDZONE_SIZE * sizeof (void *)) != NULL) {
kmem_error(KMERR_REDZONE, cp, buf, bcp);
return (-1);
}
}
if (cp->cache_flags & KMF_DEADBEEF) {
if (cp->cache_destructor != NULL)
(*cp->cache_destructor)(buf, cp->cache_bufsize);
copy_pattern(0xdeadbeef, buf, cp->cache_offset);
}
if (cp->cache_flags & KMF_AUDIT)
kmem_audit((kmem_bufctl_audit_t *)bcp);
return (0);
}
/*
* To make the magazine layer as fast as possible, we don't check for
* kmem debugging flags in production (non-DEBUG) kernels. We can get
* equivalent debugging functionality in the field by setting the
* KMF_NOMAGAZINE flag in addition to any others; that causes the
* allocator to bypass the magazine layer entirely and go straight
* to the global layer, which always checks the flags. This is not
* satisfactory for internal testing, however, because we also want
* to stress the magazine code itself; the #defines below enable
* magazine-layer debugging in DEBUG kernels.
*/
#ifdef DEBUG
#define KMEM_CACHE_ALLOC_DEBUG(cp, buf) \
if ((cp)->cache_flags & KMF_BUFTAG) \
kmem_cache_alloc_debug(cp, buf)
#define KMEM_CACHE_FREE_DEBUG(cp, buf) \
if ((cp)->cache_flags & KMF_BUFTAG) \
if (kmem_cache_free_debug(cp, buf)) \
return;
#else
#define KMEM_CACHE_ALLOC_DEBUG(cp, buf)
#define KMEM_CACHE_FREE_DEBUG(cp, buf)
#endif
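/*
 * Illustrative field-debugging recipe implied by the comment above (a
 * sketch, not original text): on a non-DEBUG kernel one would patch
 * kmem_flags early in boot -- for example from the kernel debugger, or
 * via /etc/system where it is read early enough -- to something like
 *
 *	kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE |
 *	    KMF_VERIFY | KMF_NOMAGAZINE;
 *
 * so that caches are created without magazines and every allocation
 * falls through to kmem_cache_alloc_global()/kmem_cache_free_global(),
 * which always check the debugging flags.
 */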
static void *
kmem_cache_alloc_global(kmem_cache_t *cp, int flags)
{
void *buf;
kmem_slab_t *sp, *snext, *sprev;
kmem_bufctl_t *bcp, **hash_bucket;
TRACE_2(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_START,
"kmem_cache_alloc_start:cache %S flags %x",
cp->cache_name, flags);
mutex_enter(&cp->cache_lock);
cp->cache_alloc++;
sp = cp->cache_freelist;
ASSERT(sp->slab_cache == cp);
if ((bcp = sp->slab_head) == sp->slab_tail) {
if (bcp == NULL) {
/*
* The freelist is empty. Create a new slab.
*/
if ((sp = kmem_slab_create(cp, flags)) == NULL) {
cp->cache_alloc_fail++;
mutex_exit(&cp->cache_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", NULL);
return (NULL);
}
/*
* Add slab to tail of freelist
*/
sp->slab_next = snext = &cp->cache_nullslab;
sp->slab_prev = sprev = snext->slab_prev;
snext->slab_prev = sp;
sprev->slab_next = sp;
if (cp->cache_freelist == snext)
cp->cache_freelist = sp;
sp = cp->cache_freelist;
}
/*
* If this is last buf in slab, remove slab from free list
*/
if ((bcp = sp->slab_head) == sp->slab_tail) {
cp->cache_freelist = sp->slab_next;
sp->slab_tail = NULL;
}
}
sp->slab_head = bcp->bc_next;
sp->slab_refcnt++;
ASSERT(sp->slab_refcnt <= sp->slab_chunks);
if (cp->cache_flags & KMF_HASH) {
/*
* add buf to allocated-address hash table
*/
buf = bcp->bc_addr;
hash_bucket = KMEM_HASH(cp, buf);
bcp->bc_next = *hash_bucket;
*hash_bucket = bcp;
} else {
buf = (void *)((int)bcp - cp->cache_offset);
}
ASSERT((u_int)buf - (u_int)sp->slab_base < cp->cache_slabsize);
mutex_exit(&cp->cache_lock);
if (cp->cache_flags & KMF_BUFTAG)
kmem_cache_alloc_debug(cp, buf);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", buf);
return (buf);
}
static void
kmem_cache_free_global(kmem_cache_t *cp, void *buf)
{
kmem_slab_t *sp, *snext, *sprev;
kmem_bufctl_t *bcp, **prev_bcpp, *old_slab_tail;
TRACE_2(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_START,
"kmem_cache_free_start:cache %S buf %x", cp->cache_name, buf);
ASSERT(buf != NULL);
if (cp->cache_flags & KMF_BUFTAG)
if (kmem_cache_free_debug(cp, buf))
return;
mutex_enter(&cp->cache_lock);
if (cp->cache_flags & KMF_HASH) {
/*
* look up buf in allocated-address hash table
*/
prev_bcpp = KMEM_HASH(cp, buf);
bcp = *prev_bcpp;
do {
if (bcp->bc_addr == buf)
goto lookup_success;
cp->cache_lookup_depth++;
prev_bcpp = &bcp->bc_next;
} while ((bcp = *prev_bcpp) != NULL);
mutex_exit(&cp->cache_lock);
kmem_error(KMERR_BADADDR, cp, buf, NULL);
return;
lookup_success:
/*
* remove buf from hash table
*/
*prev_bcpp = bcp->bc_next;
sp = bcp->bc_slab;
} else {
bcp = (kmem_bufctl_t *)((int)buf + cp->cache_offset);
sp = (kmem_slab_t *)((((int)buf) & -PAGESIZE) +
(PAGESIZE - sizeof (kmem_slab_t)));
}
ASSERT(sp->slab_cache == cp);
old_slab_tail = sp->slab_tail;
sp->slab_tail = bcp;
if (old_slab_tail == NULL) {
/*
* Return slab to head of free list
*/
sp->slab_head = bcp;
if ((snext = sp->slab_next) != cp->cache_freelist) {
snext->slab_prev = sprev = sp->slab_prev;
sprev->slab_next = snext;
sp->slab_next = snext = cp->cache_freelist;
sp->slab_prev = sprev = snext->slab_prev;
sprev->slab_next = sp;
snext->slab_prev = sp;
}
cp->cache_freelist = sp;
} else {
old_slab_tail->bc_next = bcp;
}
ASSERT(sp->slab_refcnt >= 1);
if (--sp->slab_refcnt == 0) {
/*
* There are no outstanding allocations from this slab,
* so we can reclaim the memory.
*/
snext = sp->slab_next;
sprev = sp->slab_prev;
snext->slab_prev = sprev;
sprev->slab_next = snext;
if (sp == cp->cache_freelist)
cp->cache_freelist = snext;
kmem_slab_destroy(cp, sp);
}
mutex_exit(&cp->cache_lock);
TRACE_0(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_END, "kmem_cache_free_end");
}
void *
kmem_cache_alloc(kmem_cache_t *cp, int flags)
{
void *buf;
kmem_cpu_cache_t *ccp;
kmem_magazine_t *mp, *fmp;
TRACE_2(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_START,
"kmem_cache_alloc_start:cache %S flags %x",
cp->cache_name, flags);
ccp = &cp->cache_cpu[CPU->cpu_seqid];
mutex_enter(&ccp->cc_lock);
mp = ccp->cc_loaded_mag;
if (ccp->cc_rounds > 0) {
ccp->cc_alloc++;
buf = mp->mag_round[--ccp->cc_rounds];
mutex_exit(&ccp->cc_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", buf);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
return (buf);
}
if ((fmp = ccp->cc_full_mag) != NULL) {
ASSERT(ccp->cc_empty_mag == NULL);
ccp->cc_alloc++;
ccp->cc_empty_mag = mp;
ccp->cc_loaded_mag = fmp;
ccp->cc_full_mag = NULL;
ccp->cc_rounds = ccp->cc_magsize - 1;
buf = fmp->mag_round[ccp->cc_magsize - 1];
mutex_exit(&ccp->cc_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", buf);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
return (buf);
}
if (ccp->cc_magsize > 0) {
if (!mutex_tryenter(&cp->cache_depot_lock)) {
mutex_enter(&cp->cache_depot_lock);
cp->cache_depot_contention++;
}
if ((fmp = cp->cache_fmag_list) != NULL) {
cp->cache_fmag_list = fmp->mag_next;
if (--cp->cache_fmag_total < cp->cache_fmag_min)
cp->cache_fmag_min = cp->cache_fmag_total;
if (mp != NULL) {
mp->mag_next = cp->cache_emag_list;
cp->cache_emag_list = mp;
cp->cache_emag_total++;
}
cp->cache_depot_alloc++;
mutex_exit(&cp->cache_depot_lock);
ccp->cc_loaded_mag = fmp;
ccp->cc_rounds = ccp->cc_magsize - 1;
buf = fmp->mag_round[ccp->cc_magsize - 1];
mutex_exit(&ccp->cc_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", buf);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
return (buf);
}
mutex_exit(&cp->cache_depot_lock);
}
mutex_exit(&ccp->cc_lock);
buf = kmem_cache_alloc_global(cp, flags);
TRACE_1(TR_FAC_KMEM, TR_KMEM_CACHE_ALLOC_END,
"kmem_cache_alloc_end:buf %x", buf);
return (buf);
}
void
kmem_cache_free(kmem_cache_t *cp, void *buf)
{
kmem_cpu_cache_t *ccp;
kmem_magazine_t *mp, *emp;
TRACE_2(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_START,
"kmem_cache_free_start:cache %S buf %x", cp->cache_name, buf);
KMEM_CACHE_FREE_DEBUG(cp, buf);
ccp = &cp->cache_cpu[CPU->cpu_seqid];
mutex_enter(&ccp->cc_lock);
mp = ccp->cc_loaded_mag;
if ((u_int)ccp->cc_rounds < ccp->cc_magsize) {
ccp->cc_free++;
mp->mag_round[ccp->cc_rounds++] = buf;
mutex_exit(&ccp->cc_lock);
TRACE_0(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_END,
"kmem_cache_free_end");
return;
}
if ((emp = ccp->cc_empty_mag) != NULL) {
ASSERT(ccp->cc_full_mag == NULL);
ccp->cc_free++;
ccp->cc_full_mag = mp;
ccp->cc_loaded_mag = emp;
ccp->cc_empty_mag = NULL;
ccp->cc_rounds = 1;
emp->mag_round[0] = buf;
mutex_exit(&ccp->cc_lock);
TRACE_0(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_END,
"kmem_cache_free_end");
return;
}
if (ccp->cc_magsize > 0) {
if (!mutex_tryenter(&cp->cache_depot_lock)) {
mutex_enter(&cp->cache_depot_lock);
cp->cache_depot_contention++;
}
if ((emp = cp->cache_emag_list) != NULL) {
cp->cache_emag_list = emp->mag_next;
if (--cp->cache_emag_total < cp->cache_emag_min)
cp->cache_emag_min = cp->cache_emag_total;
}
if (emp != NULL || (emp =
kmem_cache_alloc_global(cp->cache_magazine_cache,
KM_NOSLEEP)) != NULL) {
if (mp != NULL) {
mp->mag_next = cp->cache_fmag_list;
cp->cache_fmag_list = mp;
cp->cache_fmag_total++;
}
cp->cache_depot_free++;
mutex_exit(&cp->cache_depot_lock);
ccp->cc_loaded_mag = emp;
ccp->cc_rounds = 1;
emp->mag_round[0] = buf;
mutex_exit(&ccp->cc_lock);
TRACE_0(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_END,
"kmem_cache_free_end");
return;
}
mutex_exit(&cp->cache_depot_lock);
}
mutex_exit(&ccp->cc_lock);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
kmem_cache_free_global(cp, buf);
TRACE_0(TR_FAC_KMEM, TR_KMEM_CACHE_FREE_END, "kmem_cache_free_end");
}
void *
kmem_zalloc(size_t size, int flags)
{
void *buf;
int index = (int)(size - 1) >> KMEM_ALIGN_SHIFT;
TRACE_2(TR_FAC_KMEM, TR_KMEM_ZALLOC_START,
"kmem_zalloc_start:size %d flags %x", size, flags);
if ((u_int)index >= KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
if ((buf = kmem_alloc(size, flags)) != NULL)
KMEM_BZERO(buf, size);
} else {
kmem_cache_t *cp = kmem_alloc_table[index];
kmem_cpu_cache_t *ccp = &cp->cache_cpu[CPU->cpu_seqid];
mutex_enter(&ccp->cc_lock);
if (ccp->cc_rounds > 0) {
ccp->cc_alloc++;
buf = ccp->cc_loaded_mag->mag_round[--ccp->cc_rounds];
mutex_exit(&ccp->cc_lock);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
KMEM_BZERO(buf, size);
} else {
mutex_exit(&ccp->cc_lock);
if ((buf = kmem_cache_alloc(cp, flags)) != NULL)
KMEM_BZERO(buf, size);
}
}
TRACE_1(TR_FAC_KMEM, TR_KMEM_ZALLOC_END, "kmem_zalloc_end:buf %x", buf);
return (buf);
}
void *
kmem_alloc(size_t size, int flags)
{
void *buf;
int index = (int)(size - 1) >> KMEM_ALIGN_SHIFT;
TRACE_2(TR_FAC_KMEM, TR_KMEM_ALLOC_START,
"kmem_alloc_start:size %d flags %x", size, flags);
if ((u_int)index >= KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
if (size == 0) {
TRACE_1(TR_FAC_KMEM, TR_KMEM_ALLOC_END,
"kmem_alloc_end:buf %x", NULL);
return (NULL);
}
kmem_misc_kstat.huge_alloc.value.l++;
if ((buf = kmem_page_alloc(btopr(size), flags)) == NULL)
kmem_misc_kstat.huge_alloc_fail.value.l++;
else if (kmem_flags & KMF_DEADBEEF)
copy_pattern(0xbaddcafe, buf, size);
} else {
kmem_cache_t *cp = kmem_alloc_table[index];
kmem_cpu_cache_t *ccp = &cp->cache_cpu[CPU->cpu_seqid];
mutex_enter(&ccp->cc_lock);
if (ccp->cc_rounds > 0) {
ccp->cc_alloc++;
buf = ccp->cc_loaded_mag->mag_round[--ccp->cc_rounds];
mutex_exit(&ccp->cc_lock);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
} else {
mutex_exit(&ccp->cc_lock);
buf = kmem_cache_alloc(cp, flags);
}
}
TRACE_1(TR_FAC_KMEM, TR_KMEM_ALLOC_END, "kmem_alloc_end:buf %x", buf);
return (buf);
}
void
kmem_free(void *buf, size_t size)
{
int index = (size - 1) >> KMEM_ALIGN_SHIFT;
TRACE_2(TR_FAC_KMEM, TR_KMEM_FREE_START,
"kmem_free_start:buf %x size %d", buf, size);
if ((u_int)index >= KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
if (buf == NULL && size == 0) {
TRACE_0(TR_FAC_KMEM, TR_KMEM_FREE_END, "kmem_free_end");
return;
}
if (buf == NULL || size == 0)
cmn_err(CE_PANIC, "kmem_free(%x, %d) impossible",
buf, size);
kmem_page_free(buf, btopr(size));
} else {
kmem_cache_t *cp = kmem_alloc_table[index];
kmem_cpu_cache_t *ccp = &cp->cache_cpu[CPU->cpu_seqid];
KMEM_CACHE_FREE_DEBUG(cp, buf);
mutex_enter(&ccp->cc_lock);
if ((u_int)ccp->cc_rounds < ccp->cc_magsize) {
ccp->cc_free++;
ccp->cc_loaded_mag->mag_round[ccp->cc_rounds++] = buf;
mutex_exit(&ccp->cc_lock);
} else {
mutex_exit(&ccp->cc_lock);
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
kmem_cache_free(cp, buf);
}
}
TRACE_0(TR_FAC_KMEM, TR_KMEM_FREE_END, "kmem_free_end");
}
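/*
 * Typical kmem_alloc()/kmem_free() pairing (illustrative sketch): the
 * caller, not the allocator, remembers the size, and the size passed to
 * kmem_free() must be the size originally requested, since it is what
 * selects the backing cache (or the raw page path) above.
 *
 *	buf = kmem_alloc(len, KM_SLEEP);
 *	... use buf ...
 *	kmem_free(buf, len);
 */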
/* ARGSUSED */
void *
kmem_fast_zalloc(char **base, size_t size, int chunks, int flags)
{
void *buf;
TRACE_4(TR_FAC_KMEM, TR_KMEM_FAST_ZALLOC_START,
"kmem_fast_zalloc_start:base %K size %d chunks %d flags %x",
base, size, chunks, flags);
if ((buf = *base) == NULL) {
if (kmem_fast_debug) {
buf = kmem_perm_alloc(size + KMEM_ALIGN,
KMEM_ALIGN, flags);
if (buf != NULL) {
buf = (char *)buf + KMEM_ALIGN;
((int *)buf)[-1] = size;
copy_pattern(0xdeadbeef, buf, size);
}
} else {
buf = kmem_perm_alloc(size, KMEM_ALIGN, flags);
}
if (buf == NULL) {
TRACE_1(TR_FAC_KMEM, TR_KMEM_FAST_ZALLOC_END,
"kmem_fast_zalloc_end:buf %x", buf);
return (NULL);
}
} else {
*base = *(void **)buf;
}
if (kmem_fast_debug) {
*(void **)buf = (void *)0xdeadbeef;
ASSERT(((int *)buf)[-1] == size);
if (verify_pattern(0xdeadbeef, buf, size) != NULL)
kmem_error(KMERR_MODIFIED, NULL, buf, NULL);
}
KMEM_BZERO(buf, size);
TRACE_1(TR_FAC_KMEM, TR_KMEM_FAST_ZALLOC_END,
"kmem_fast_zalloc_end:buf %x", buf);
return (buf);
}
/* ARGSUSED */
void *
kmem_fast_alloc(char **base, size_t size, int chunks, int flags)
{
void *buf;
TRACE_4(TR_FAC_KMEM, TR_KMEM_FAST_ALLOC_START,
"kmem_fast_alloc_start:base %K size %d chunks %d flags %x",
base, size, chunks, flags);
if ((buf = *base) == NULL) {
if (kmem_fast_debug) {
buf = kmem_perm_alloc(size + KMEM_ALIGN,
KMEM_ALIGN, flags);
if (buf != NULL) {
buf = (char *)buf + KMEM_ALIGN;
((int *)buf)[-1] = size;
copy_pattern(0xdeadbeef, buf, size);
}
} else {
buf = kmem_perm_alloc(size, KMEM_ALIGN, flags);
}
if (buf == NULL) {
TRACE_1(TR_FAC_KMEM, TR_KMEM_FAST_ALLOC_END,
"kmem_fast_alloc_end:buf %x", buf);
return (NULL);
}
} else {
*base = *(void **)buf;
}
if (kmem_fast_debug) {
*(void **)buf = (void *)0xdeadbeef;
ASSERT(((int *)buf)[-1] == size);
if (verify_pattern(0xdeadbeef, buf, size) != NULL)
kmem_error(KMERR_MODIFIED, NULL, buf, NULL);
copy_pattern(0xbaddcafe, buf, size);
}
TRACE_1(TR_FAC_KMEM, TR_KMEM_FAST_ALLOC_END,
"kmem_fast_alloc_end:buf %x", buf);
return (buf);
}
void
kmem_fast_free(char **base, void *buf)
{
TRACE_2(TR_FAC_KMEM, TR_KMEM_FAST_FREE_START,
"kmem_fast_free_start:base %K buf %x", base, buf);
if (kmem_fast_debug)
copy_pattern(0xdeadbeef, buf, ((int *)buf)[-1]);
*(void **)buf = *base;
*base = buf;
TRACE_0(TR_FAC_KMEM, TR_KMEM_FAST_FREE_END, "kmem_fast_free_end");
}
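/*
 * Illustrative use of the kmem_fast_*() interface above (foo_t and
 * foo_freelist are hypothetical): the caller supplies its own freelist
 * head, and freed buffers are threaded through their first word.
 *
 *	static char *foo_freelist;
 *
 *	foo_t *fp = kmem_fast_alloc(&foo_freelist, sizeof (foo_t), 1,
 *	    KM_SLEEP);
 *	... use fp ...
 *	kmem_fast_free(&foo_freelist, (void *)fp);
 *
 * Buffers obtained this way are never returned to the system; they only
 * move between the caller's freelist and its active data structures.
 */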
static void
kmem_magazine_destroy_one(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
{
if (mp != NULL) {
int round;
for (round = 0; round < nrounds; round++) {
KMEM_CACHE_ALLOC_DEBUG(cp, mp->mag_round[round]);
kmem_cache_free_global(cp, mp->mag_round[round]);
}
kmem_cache_free_global(cp->cache_magazine_cache, mp);
}
}
static void
kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
{
while (mp != NULL) {
kmem_magazine_t *nextmp = mp->mag_next;
kmem_magazine_destroy_one(cp, mp, nrounds);
mp = nextmp;
}
}
static void
kmem_async_dispatch(void (*func)(kmem_cache_t *), kmem_cache_t *cp, int flags)
{
kmem_async_t *ap, *anext, *aprev;
TRACE_3(TR_FAC_KMEM, TR_KMEM_ASYNC_DISPATCH_START,
"kmem_async_dispatch_start:%K(%S) flags %x",
func, cp->cache_name, flags);
mutex_enter(&kmem_async_lock);
if ((ap = kmem_async_freelist) == NULL) {
mutex_exit(&kmem_async_lock);
if ((ap = kmem_perm_alloc(sizeof (*ap), 0, flags)) == NULL)
return;
mutex_enter(&kmem_async_lock);
} else {
kmem_async_freelist = ap->async_next;
}
ap->async_next = anext = &kmem_async_queue;
ap->async_prev = aprev = kmem_async_queue.async_prev;
aprev->async_next = ap;
anext->async_prev = ap;
ap->async_func = func;
ap->async_cache = cp;
cv_signal(&kmem_async_cv);
mutex_exit(&kmem_async_lock);
TRACE_1(TR_FAC_KMEM, TR_KMEM_ASYNC_DISPATCH_END,
"kmem_async_dispatch_end:async_entry %x", ap);
}
/*
* Reclaim all unused memory from a cache.
*/
static void
kmem_cache_reap(kmem_cache_t *cp)
{
int reaplimit, magsize;
kmem_magazine_t *mp, *fmp, *reaplist;
kmem_cpu_cache_t *ccp;
void *buf = NULL;
ASSERT(MUTEX_HELD(&kmem_async_serialize));
/*
* Ask the cache's owner to free some memory if possible.
* The idea is to handle things like the inode cache, which
* typically sits on a bunch of memory that it doesn't truly
* *need*. Reclaim policy is entirely up to the owner; this
* callback is just an advisory plea for help.
*/
if (cp->cache_reclaim != NULL)
(*cp->cache_reclaim)();
/*
* We want to ensure that unused buffers scattered across multiple
* cpus will eventually coalesce into complete slabs. Each time
* this routine runs it selects a victim cpu, returns its full
* magazine to the depot, and returns one buffer from its loaded
* magazine to the global layer. Eventually all unused memory
* is reclaimed.
*/
mutex_enter(&cp->cache_depot_lock);
if (++cp->cache_cpu_rotor >= ncpus)
cp->cache_cpu_rotor = 0;
ccp = &cp->cache_cpu[cp->cache_cpu_rotor];
mutex_exit(&cp->cache_depot_lock);
mutex_enter(&ccp->cc_lock);
if ((fmp = ccp->cc_full_mag) != NULL) {
ccp->cc_full_mag = NULL;
mutex_enter(&cp->cache_depot_lock);
fmp->mag_next = cp->cache_fmag_list;
cp->cache_fmag_list = fmp;
cp->cache_fmag_total++;
mutex_exit(&cp->cache_depot_lock);
}
if (ccp->cc_rounds > 0) {
ccp->cc_alloc++;
buf = ccp->cc_loaded_mag->mag_round[--ccp->cc_rounds];
}
mutex_exit(&ccp->cc_lock);
if (buf != NULL) {
KMEM_CACHE_ALLOC_DEBUG(cp, buf);
kmem_cache_free_global(cp, buf);
}
mutex_enter(&cp->cache_depot_lock);
reaplimit = min(cp->cache_fmag_reaplimit, cp->cache_fmag_min);
reaplist = NULL;
cp->cache_fmag_total -= reaplimit;
cp->cache_fmag_min -= reaplimit;
cp->cache_fmag_reaplimit = 0;
while (--reaplimit >= 0) {
mp = cp->cache_fmag_list;
cp->cache_fmag_list = mp->mag_next;
mp->mag_next = reaplist;
reaplist = mp;
}
magsize = cp->cache_magazine_size;
mutex_exit(&cp->cache_depot_lock);
kmem_magazine_destroy(cp, reaplist, magsize);
mutex_enter(&cp->cache_depot_lock);
reaplimit = min(cp->cache_emag_reaplimit, cp->cache_emag_min);
reaplist = NULL;
cp->cache_emag_total -= reaplimit;
cp->cache_emag_min -= reaplimit;
cp->cache_emag_reaplimit = 0;
while (--reaplimit >= 0) {
mp = cp->cache_emag_list;
cp->cache_emag_list = mp->mag_next;
mp->mag_next = reaplist;
reaplist = mp;
}
mutex_exit(&cp->cache_depot_lock);
kmem_magazine_destroy(cp, reaplist, 0);
}
/*
* Reclaim all unused memory from all caches. Called from the VM system
* when memory gets tight.
*/
void
kmem_reap(void)
{
kmem_cache_t *cp;
if ((int)(lbolt - kmem_reap_lasttime) < kmem_reap_interval)
return;
kmem_reap_lasttime = lbolt;
mutex_enter(&kmem_cache_lock);
for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache;
cp = cp->cache_next)
kmem_async_dispatch(kmem_cache_reap, cp, KM_NOSLEEP);
mutex_exit(&kmem_cache_lock);
}
/*
* Purge all magazines from a cache and set its magazine limit to zero.
* All calls are serialized by kmem_async_serialize.
*/
static void
kmem_cache_magazine_purge(kmem_cache_t *cp)
{
kmem_cpu_cache_t *ccp;
kmem_magazine_t *mp, *fmp, *emp;
int rounds, magsize, cpu_seqid;
ASSERT(MUTEX_HELD(&kmem_async_serialize));
ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
for (cpu_seqid = 0; cpu_seqid < ncpus; cpu_seqid++) {
ccp = &cp->cache_cpu[cpu_seqid];
mutex_enter(&ccp->cc_lock);
rounds = ccp->cc_rounds;
magsize = ccp->cc_magsize;
mp = ccp->cc_loaded_mag;
fmp = ccp->cc_full_mag;
emp = ccp->cc_empty_mag;
ccp->cc_rounds = -1;
ccp->cc_magsize = 0;
ccp->cc_loaded_mag = NULL;
ccp->cc_full_mag = NULL;
ccp->cc_empty_mag = NULL;
mutex_exit(&ccp->cc_lock);
kmem_magazine_destroy_one(cp, mp, rounds);
kmem_magazine_destroy_one(cp, fmp, magsize);
kmem_magazine_destroy_one(cp, emp, 0);
}
mutex_enter(&cp->cache_depot_lock);
cp->cache_fmag_min = cp->cache_fmag_reaplimit = cp->cache_fmag_total;
cp->cache_emag_min = cp->cache_emag_reaplimit = cp->cache_emag_total;
mutex_exit(&cp->cache_depot_lock);
kmem_cache_reap(cp);
}
/*
* Recompute a cache's magazine size. The trade-off is that larger magazines
* provide a higher transfer rate with the global layer, while smaller
* magazines reduce memory consumption. Magazine resizing is an expensive
* operation; it should not be done frequently. Changes to the magazine size
* are serialized by kmem_async_serialize.
*
* Note: at present this only grows the magazine size. It might be useful
* to allow shrinkage too.
*/
static void
kmem_magazine_resize(kmem_cache_t *cp)
{
kmem_cpu_cache_t *ccp;
int cpu_seqid;
kmem_magazine_type_t *mtp;
ASSERT(MUTEX_HELD(&kmem_async_serialize));
mutex_enter(&cp->cache_depot_lock);
if (cp->cache_magazine_size == cp->cache_magazine_maxsize) {
mutex_exit(&cp->cache_depot_lock);
return;
}
mtp = kmem_magazine_type;
while (mtp->mt_magsize <= cp->cache_magazine_size)
mtp++;
mutex_exit(&cp->cache_depot_lock);
kmem_cache_magazine_purge(cp);
mutex_enter(&cp->cache_depot_lock);
cp->cache_magazine_cache = mtp->mt_cache;
cp->cache_magazine_size = mtp->mt_magsize;
mutex_exit(&cp->cache_depot_lock);
for (cpu_seqid = 0; cpu_seqid < ncpus; cpu_seqid++) {
ccp = &cp->cache_cpu[cpu_seqid];
mutex_enter(&ccp->cc_lock);
ccp->cc_magsize = mtp->mt_magsize;
mutex_exit(&ccp->cc_lock);
}
/*
* Recalibrate contention count -- we don't want any previous
* contention to be counted against the new magazine size.
*/
mutex_enter(&cp->cache_depot_lock);
cp->cache_depot_contention_last = cp->cache_depot_contention;
mutex_exit(&cp->cache_depot_lock);
}
/*
* Rescale a cache's hash table, so that the table size is roughly the
* cache size. We want the average lookup time to be extremely small.
*/
static void
kmem_hash_rescale(kmem_cache_t *cp)
{
int old_size, new_size, h, rescale;
kmem_bufctl_t **old_table, **new_table, *bcp;
ASSERT(MUTEX_HELD(&kmem_async_serialize));
TRACE_2(TR_FAC_KMEM, TR_KMEM_HASH_RESCALE_START,
"kmem_hash_rescale_start:cache %S buftotal %d",
cp->cache_name, cp->cache_buftotal);
new_size = max(KMEM_MIN_HASH_SIZE,
1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
new_table = kmem_alloc(new_size * sizeof (kmem_bufctl_t *), KM_NOSLEEP);
if (new_table == NULL)
return;
copy_pattern((u_long)&kmem_null_bufctl, new_table,
new_size * sizeof (kmem_bufctl_t *));
mutex_enter(&cp->cache_lock);
old_size = cp->cache_hash_mask + 1;
old_table = cp->cache_hash_table;
cp->cache_hash_mask = new_size - 1;
cp->cache_hash_table = new_table;
for (h = 0; h < old_size; h++) {
bcp = old_table[h];
while (bcp != &kmem_null_bufctl) {
void *addr = bcp->bc_addr;
kmem_bufctl_t *next_bcp = bcp->bc_next;
kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr);
bcp->bc_next = *hash_bucket;
*hash_bucket = bcp;
bcp = next_bcp;
}
}
rescale = cp->cache_rescale++;
mutex_exit(&cp->cache_lock);
if (rescale == 0) /* if old_table is the initial table */
kmem_page_free(old_table, 1);
else
kmem_free(old_table, old_size * sizeof (kmem_bufctl_t *));
TRACE_0(TR_FAC_KMEM, TR_KMEM_HASH_RESCALE_END, "kmem_hash_rescale_end");
}
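/*
 * Worked example of the sizing rule above (illustrative, assuming
 * highbit() returns the 1-based position of the highest set bit): for a
 * cache holding roughly 3000 buffers,
 *
 *	3 * 3000 + 4 = 9004, highbit(9004) = 14, new_size = 1 << 12 = 4096
 *
 * so there are more buckets than buffers and the expected hash chain
 * length stays below one.
 */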
/*
* Perform periodic maintenance on a cache: hash rescaling,
* depot working-set update, and magazine resizing.
*/
static void
kmem_cache_update(kmem_cache_t *cp)
{
mutex_enter(&cp->cache_lock);
/*
* If the cache has become much larger or smaller than its hash table,
* fire off a request to rescale the hash table.
*/
if ((cp->cache_flags & KMF_HASH) &&
(cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
(cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
cp->cache_hash_mask > KMEM_MIN_HASH_SIZE)))
kmem_async_dispatch(kmem_hash_rescale, cp, KM_NOSLEEP);
mutex_enter(&cp->cache_depot_lock);
/*
* Update the depot working set sizes
*/
cp->cache_fmag_reaplimit = cp->cache_fmag_min;
cp->cache_fmag_min = cp->cache_fmag_total;
cp->cache_emag_reaplimit = cp->cache_emag_min;
cp->cache_emag_min = cp->cache_emag_total;
/*
* If there's a lot of contention in the depot,
* increase the magazine size.
*/
if (cp->cache_magazine_size < cp->cache_magazine_maxsize &&
cp->cache_depot_contention - cp->cache_depot_contention_last >
kmem_depot_contention)
kmem_async_dispatch(kmem_magazine_resize, cp, KM_NOSLEEP);
cp->cache_depot_contention_last = cp->cache_depot_contention;
mutex_exit(&cp->cache_depot_lock);
mutex_exit(&cp->cache_lock);
}
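/*
 * Numeric example of the working-set update above (illustrative): if a
 * cache's full-magazine count never dropped below 5 during this interval,
 * cache_fmag_min is 5 when kmem_cache_update() runs, so
 * cache_fmag_reaplimit becomes 5 and the next kmem_cache_reap() may
 * return up to 5 full magazines -- the unused working set -- to the
 * global layer, while the magazines that were actually cycled stay in
 * the depot.
 */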
static void
kmem_update(void *dummy)
{
kmem_cache_t *cp;
mutex_enter(&kmem_cache_lock);
for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache;
cp = cp->cache_next)
kmem_cache_update(cp);
mutex_exit(&kmem_cache_lock);
/*
* XXX -- Check to see if the system is out of kernelmap.
* This gets an 'XXX' because the allocator shouldn't
* know about such things. This will go away when I add
* pluggable back-end page supply vectors.
*/
if (mapwant(kernelmap))
kmem_reap();
(void) timeout(kmem_update, dummy, kmem_reap_interval);
}
static int
kmem_cache_kstat_update(kstat_t *ksp, int rw)
{
struct kmem_cache_kstat *kmcp;
kmem_cache_t *cp;
kmem_slab_t *sp;
int cpu_buf_avail;
int buf_avail = 0;
int cpu_seqid;
if (rw == KSTAT_WRITE)
return (EACCES);
kmcp = kmem_cache_kstat;
cp = ksp->ks_private;
kmcp->kmc_alloc_fail.value.l = cp->cache_alloc_fail;
kmcp->kmc_alloc.value.l = cp->cache_alloc;
kmcp->kmc_global_alloc.value.l = cp->cache_alloc;
for (cpu_seqid = 0; cpu_seqid < ncpus; cpu_seqid++) {
kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
struct kmem_cpu_kstat *kmcpup = &kmem_cpu_kstat[cpu_seqid];
mutex_enter(&ccp->cc_lock);
cpu_buf_avail = 0;
if (ccp->cc_rounds > 0)
cpu_buf_avail += ccp->cc_rounds;
if (ccp->cc_full_mag)
cpu_buf_avail += ccp->cc_magsize;
kmcpup->kmcpu_alloc_from.value.l = ccp->cc_alloc;
kmcpup->kmcpu_free_to.value.l = ccp->cc_free;
kmcpup->kmcpu_buf_avail.value.l = cpu_buf_avail;
kmcp->kmc_alloc.value.l += ccp->cc_alloc;
buf_avail += cpu_buf_avail;
mutex_exit(&ccp->cc_lock);
}
mutex_enter(&cp->cache_depot_lock);
kmcp->kmc_depot_alloc.value.l = cp->cache_depot_alloc;
kmcp->kmc_depot_free.value.l = cp->cache_depot_free;
kmcp->kmc_depot_contention.value.l = cp->cache_depot_contention;
kmcp->kmc_empty_magazines.value.l = cp->cache_emag_total;
kmcp->kmc_full_magazines.value.l = cp->cache_fmag_total;
kmcp->kmc_magazine_size.value.l = cp->cache_magazine_size;
kmcp->kmc_alloc.value.l += cp->cache_depot_alloc;
buf_avail += cp->cache_fmag_total * cp->cache_magazine_size;
mutex_exit(&cp->cache_depot_lock);
kmcp->kmc_buf_size.value.l = cp->cache_bufsize;
kmcp->kmc_align.value.l = cp->cache_align;
kmcp->kmc_chunk_size.value.l = cp->cache_chunksize;
kmcp->kmc_slab_size.value.l = cp->cache_slabsize;
for (sp = cp->cache_freelist; sp != &cp->cache_nullslab;
sp = sp->slab_next)
buf_avail += sp->slab_chunks - sp->slab_refcnt;
kmcp->kmc_buf_avail.value.l = buf_avail;
kmcp->kmc_buf_total.value.l = cp->cache_buftotal;
kmcp->kmc_buf_max.value.l = cp->cache_bufmax;
kmcp->kmc_slab_create.value.l = cp->cache_slab_create;
kmcp->kmc_slab_destroy.value.l = cp->cache_slab_destroy;
kmcp->kmc_hash_size.value.l = (cp->cache_hash_mask + 1) & -2;
kmcp->kmc_hash_lookup_depth.value.l = cp->cache_lookup_depth;
kmcp->kmc_hash_rescale.value.l = cp->cache_rescale;
return (0);
}
static void
kmem_cache_create_finish(kmem_cache_t *cp)
{
ASSERT(MUTEX_HELD(&kmem_async_serialize));
if (cp->cache_ncpus > ncpus) {
/*
* We over-allocated cache_cpu[] in kmem_cache_create()
* because we didn't know ncpus at that time. Now we do,
* so return the unused memory to kmem_perm_freelist.
*/
int size, cpu_seqid;
for (cpu_seqid = ncpus; cpu_seqid < cp->cache_ncpus;
cpu_seqid++)
mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
size = (cp->cache_ncpus - ncpus) * sizeof (kmem_cpu_cache_t);
if (size >= sizeof (kmem_perm_t) + KMEM_PERM_MINFREE) {
char *buf = (char *)&cp->cache_cpu[ncpus];
kmem_perm_t *pp = (kmem_perm_t *)buf;
mutex_enter(&kmem_perm_lock);
kmem_misc_kstat.perm_size.value.l += size;
pp->perm_next = kmem_perm_freelist;
pp->perm_current = buf + sizeof (kmem_perm_t);
pp->perm_avail = size - sizeof (kmem_perm_t);
kmem_perm_freelist = pp;
mutex_exit(&kmem_perm_lock);
}
cp->cache_ncpus = ncpus;
}
if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name,
"kmem_cache", KSTAT_TYPE_NAMED,
sizeof (struct kmem_cache_kstat) / sizeof (kstat_named_t) +
sizeof (struct kmem_cpu_kstat) * ncpus / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
cp->cache_kstat->ks_data = kmem_cache_kstat;
cp->cache_kstat->ks_update = kmem_cache_kstat_update;
cp->cache_kstat->ks_private = cp;
cp->cache_kstat->ks_lock = &cp->cache_lock;
kstat_install(cp->cache_kstat);
}
}
kmem_cache_t *
kmem_cache_create(
char *name, /* descriptive name for this cache */
size_t bufsize, /* size of the objects it manages */
int align, /* required object alignment */
void (*constructor)(void *, size_t), /* object constructor */
void (*destructor)(void *, size_t), /* object destructor */
void (*reclaim)(void)) /* memory reclaim callback */
{
int cpu_seqid, slabsize, chunksize, offset, maxcolor, raw;
kmem_cache_t *cp, *cnext, *cprev;
kmem_magazine_type_t *mtp;
char namebuf[128];
mutex_enter(&kmem_cache_lock);
if ((cp = kmem_cache_freelist) == NULL) {
mutex_exit(&kmem_cache_lock);
cp = kmem_perm_alloc((KMEM_CACHE_SIZE(max_ncpus) +
KMEM_CPU_CACHE_SIZE - 1) & -KMEM_CPU_CACHE_SIZE,
KMEM_CPU_CACHE_SIZE, KM_SLEEP);
/*
* Make sure that cp->cache_cpu[0] is aligned on a
* KMEM_CPU_CACHE_SIZE-byte boundary. The idea is
* to keep per-cpu data on different cache lines.
*/
cp = (kmem_cache_t *)((int)cp +
(-KMEM_CACHE_SIZE(0) & (KMEM_CPU_CACHE_SIZE - 1)));
bzero((char *)cp, KMEM_CACHE_SIZE(max_ncpus));
cp->cache_ncpus = max_ncpus;
} else {
kmem_cache_freelist = cp->cache_next;
mutex_exit(&kmem_cache_lock);
bzero((char *)cp, KMEM_CACHE_SIZE(ncpus));
cp->cache_ncpus = ncpus;
}
strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN);
mutex_init(&cp->cache_lock, name, MUTEX_DEFAULT, NULL);
sprintf(namebuf, "%s_depot", name);
mutex_init(&cp->cache_depot_lock, namebuf, MUTEX_DEFAULT, NULL);
cv_init(&cp->cache_cv, name, CV_DEFAULT, NULL);
if (align < kmem_align)
align = kmem_align;
if ((align & (align - 1)) || align > PAGESIZE)
cmn_err(CE_PANIC, "kmem_cache_create: bad alignment %d", align);
cp->cache_bufsize = bufsize;
cp->cache_align = align;
cp->cache_constructor = constructor;
cp->cache_destructor = destructor;
cp->cache_reclaim = reclaim;
chunksize = (bufsize + (KMEM_ALIGN - 1)) & -KMEM_ALIGN;
raw = ((chunksize + KMEM_ALIGN + align) <= kmem_maxraw);
if (raw && (constructor || destructor)) {
if (chunksize - bufsize >= sizeof (void *)) {
offset = chunksize - sizeof (void *);
} else {
offset = chunksize;
chunksize += KMEM_ALIGN;
}
} else {
offset = chunksize - KMEM_ALIGN;
}
if (kmem_flags & KMF_BUFTAG) {
offset = (bufsize + (sizeof (void *) - 1)) & -(sizeof (void *));
chunksize = offset + sizeof (kmem_buftag_t);
}
chunksize = (chunksize + align - 1) & -align;
cp->cache_chunksize = chunksize;
cp->cache_offset = offset;
cp->cache_flags = kmem_flags | KMF_READY;
cp->cache_freelist = &cp->cache_nullslab;
cp->cache_nullslab.slab_cache = cp;
cp->cache_nullslab.slab_refcnt = -1;
cp->cache_nullslab.slab_next = &cp->cache_nullslab;
cp->cache_nullslab.slab_prev = &cp->cache_nullslab;
mtp = kmem_magazine_type;
while (chunksize <= mtp->mt_maxbuf)
mtp++;
if (!(kmem_flags & KMF_NOMAGAZINE) && mtp->mt_magsize != 0) {
cp->cache_magazine_maxsize = mtp->mt_magsize;
mtp = kmem_magazine_type;
while (chunksize <= mtp->mt_minbuf)
mtp++;
cp->cache_magazine_size = mtp->mt_magsize;
cp->cache_magazine_cache = mtp->mt_cache;
}
if (raw) {
slabsize = PAGESIZE;
maxcolor = (slabsize - sizeof (kmem_slab_t)) % chunksize;
} else {
int chunks, bestfit, waste, minwaste = INT_MAX;
for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) {
slabsize = ptob(btopr(chunksize * chunks));
chunks = slabsize / chunksize;
maxcolor = slabsize % chunksize;
waste = maxcolor / chunks;
if (waste < minwaste) {
minwaste = waste;
bestfit = slabsize;
}
}
slabsize = bestfit;
maxcolor = slabsize % chunksize;
}
if (kmem_flags & KMF_NOCOLOR)
maxcolor = 0;
cp->cache_slabsize = slabsize;
cp->cache_color = (maxcolor >> 1) & -align;
cp->cache_maxcolor = maxcolor;
if (raw) {
/*
* Buffer auditing can't be applied to raw caches
*/
cp->cache_flags &= ~KMF_AUDIT;
} else {
cp->cache_hash_table = kmem_page_alloc(1, KM_SLEEP);
cp->cache_hash_mask = PAGESIZE / sizeof (void *) - 1;
cp->cache_hash_shift = highbit((u_long)chunksize) - 1;
copy_pattern((u_long)&kmem_null_bufctl,
cp->cache_hash_table, PAGESIZE);
cp->cache_flags |= KMF_HASH;
kmem_async_dispatch(kmem_hash_rescale, cp, KM_SLEEP);
}
for (cpu_seqid = 0; cpu_seqid < cp->cache_ncpus; cpu_seqid++) {
kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
sprintf(namebuf, "%s_cpu_seqid_%d", name, cpu_seqid);
mutex_init(&ccp->cc_lock, namebuf, MUTEX_ADAPTIVE, NULL);
ccp->cc_magsize = cp->cache_magazine_size;
ccp->cc_rounds = -1; /* no current magazine */
}
mutex_enter(&kmem_cache_lock);
cp->cache_next = cnext = &kmem_null_cache;
cp->cache_prev = cprev = kmem_null_cache.cache_prev;
cnext->cache_prev = cp;
cprev->cache_next = cp;
mutex_exit(&kmem_cache_lock);
/*
* We can't quite finish creating the cache now because caches can
* be created very early in the life of the system. Specifically
* kmem_cache_create() can be called before the total number of
* cpus is known and before the kstat framework is initialized.
* However, the cache *is* usable at this point, so we can just
* direct the kmem async thread (which doesn't come to life until
* ncpus is known and kstats are working) to apply the finishing
* touches later, when it's safe to do so.
*/
kmem_async_dispatch(kmem_cache_create_finish, cp, KM_SLEEP);
return (cp);
}
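/*
 * Canonical client usage of the object-cache interface defined above
 * (illustrative sketch in the spirit of the Bonwick paper; foo_t,
 * foo_constructor() and foo_destructor() are hypothetical):
 *
 *	foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t),
 *	    0, foo_constructor, foo_destructor, NULL);
 *
 *	fp = kmem_cache_alloc(foo_cache, KM_SLEEP);
 *	... use fp ...
 *	kmem_cache_free(foo_cache, fp);
 *
 *	kmem_cache_destroy(foo_cache);
 *
 * The constructor and destructor run once per buffer as slabs are created
 * and destroyed, not on every allocation, so expensive initialization
 * (mutexes, condition variables) is amortized across many alloc/free
 * pairs.
 */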
static void
kmem_cache_destroy_finish(kmem_cache_t *cp)
{
int cpu_seqid;
ASSERT(MUTEX_HELD(&kmem_async_serialize));
if (cp->cache_kstat)
kstat_delete(cp->cache_kstat);
if (cp->cache_flags & KMF_HASH) {
if (cp->cache_rescale == 0) /* initial hash table */
kmem_page_free(cp->cache_hash_table, 1);
else
kmem_free(cp->cache_hash_table, sizeof (void *) *
(cp->cache_hash_mask + 1));
}
for (cpu_seqid = 0; cpu_seqid < ncpus; cpu_seqid++)
mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
cv_destroy(&cp->cache_cv);
mutex_destroy(&cp->cache_depot_lock);
mutex_destroy(&cp->cache_lock);
mutex_enter(&kmem_cache_lock);
cp->cache_next = kmem_cache_freelist;
kmem_cache_freelist = cp;
mutex_exit(&kmem_cache_lock);
}
void
kmem_cache_destroy(kmem_cache_t *cp)
{
/*
* Remove the cache from the global cache list (so that no one else
* can schedule async events on its behalf), purge the cache, and
* then destroy it. Since the async thread processes requests in
* FIFO order we can assume that kmem_cache_destroy_finish() will
* not be invoked until all other pending async events for this
* cache have completed and the cache is empty.
*
* Note that we *must* purge the cache synchronously because we have
* pointers to the caller's constructor, destructor, and reclaim
* routines. These can either go away (via module unloading) or
* cease to make sense (by referring to destroyed client state)
* as soon as kmem_cache_destroy() returns control to the caller.
*/
mutex_enter(&kmem_cache_lock);
cp->cache_prev->cache_next = cp->cache_next;
cp->cache_next->cache_prev = cp->cache_prev;
mutex_exit(&kmem_cache_lock);
mutex_enter(&kmem_async_serialize); /* lock out the async thread */
kmem_cache_magazine_purge(cp);
mutex_enter(&cp->cache_lock);
if (cp->cache_buftotal != 0)
cmn_err(CE_PANIC, "kmem_cache_destroy: '%s' (%x) not empty",
cp->cache_name, (int)cp);
cp->cache_reclaim = NULL;
/*
* The cache is now dead. There should be no further activity.
* We enforce this by setting land mines in the constructor and
* destructor routines that induce a kernel text fault if invoked.
*/
cp->cache_constructor = (void (*)(void *, size_t))1;
cp->cache_destructor = (void (*)(void *, size_t))2;
mutex_exit(&cp->cache_lock);
mutex_exit(&kmem_async_serialize);
kmem_async_dispatch(kmem_cache_destroy_finish, cp, KM_SLEEP);
}
void
kmem_async_thread(void)
{
kmem_async_t *ap, *anext, *aprev;
kstat_t *ksp;
cpu_t *cpup;
int nk;
kmem_cache_kstat = kmem_perm_alloc(sizeof (struct kmem_cache_kstat) +
ncpus * sizeof (struct kmem_cpu_kstat), 0, KM_SLEEP);
kmem_cpu_kstat = (void *)(kmem_cache_kstat + 1);
bcopy((void *)&kmem_cache_kstat_template, (void *)kmem_cache_kstat,
sizeof (struct kmem_cache_kstat));
mutex_enter(&cpu_lock);
cpup = cpu_list;
do {
kstat_named_t *src = (void *)&kmem_cpu_kstat_template;
kstat_named_t *dst = (void *)&kmem_cpu_kstat[cpup->cpu_seqid];
bcopy((void *)src, (void *)dst,
sizeof (kmem_cpu_kstat_template));
nk = sizeof (struct kmem_cpu_kstat) / sizeof (kstat_named_t);
while (--nk >= 0)
sprintf((dst++)->name, (src++)->name, cpup->cpu_id);
cpup = cpup->cpu_next;
} while (cpup != cpu_list);
mutex_exit(&cpu_lock);
if ((ksp = kstat_create("unix", 0, "kmem_misc", "kmem",
KSTAT_TYPE_NAMED, sizeof (kmem_misc_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
ksp->ks_data = &kmem_misc_kstat;
kstat_install(ksp);
}
(void) timeout(kmem_update, 0, kmem_reap_interval);
mutex_enter(&kmem_async_lock);
for (;;) {
while (kmem_async_queue.async_next == &kmem_async_queue)
cv_wait(&kmem_async_cv, &kmem_async_lock);
ap = kmem_async_queue.async_next;
anext = ap->async_next;
aprev = ap->async_prev;
aprev->async_next = anext;
anext->async_prev = aprev;
mutex_exit(&kmem_async_lock);
TRACE_3(TR_FAC_KMEM, TR_KMEM_ASYNC_SERVICE_START,
"kmem_async_service_start:async_entry %x %K(%S)",
ap, ap->async_func, ap->async_cache->cache_name);
mutex_enter(&kmem_async_serialize);
(*ap->async_func)(ap->async_cache);
mutex_exit(&kmem_async_serialize);
TRACE_0(TR_FAC_KMEM, TR_KMEM_ASYNC_SERVICE_END,
"kmem_async_service_end");
mutex_enter(&kmem_async_lock);
ap->async_next = kmem_async_freelist;
kmem_async_freelist = ap;
}
}
void
kmem_init(void)
{
int i, size, cache_size;
kmem_cache_t *cp;
kmem_magazine_type_t *mtp;
#ifdef DEBUG
/*
* Hack to deal with suninstall brokenness.
* Suninstall has to run in 16MB of *total virtual memory*, so if
* we've got less than 20MB, don't do memory-intensive kmem debugging.
*/
if ((physmem >> (20 - PAGESHIFT)) < 20)
kmem_flags = 0;
#endif
mutex_init(&kmem_cache_lock, "kmem_cache_lock", MUTEX_DEFAULT, NULL);
mutex_init(&kmem_perm_lock, "kmem_perm_lock", MUTEX_DEFAULT, NULL);
cv_init(&kmem_async_cv, "kmem_async_cv", CV_DEFAULT, NULL);
mutex_init(&kmem_async_serialize, "kmem_async_serialize",
MUTEX_DEFAULT, NULL);
kmem_async_queue.async_next = &kmem_async_queue;
kmem_async_queue.async_prev = &kmem_async_queue;
if (kmem_maxraw == -1) /* i.e., if not patched */
kmem_maxraw = PAGESIZE / KMEM_VOID_FRACTION - 1;
/*
* We stick the kmem_null_bufctl struct on the tail of every
* hash chain, so that we don't have to check for a NULL
* pointer on the first lookup (which almost always succeeds).
*/
kmem_null_bufctl.bc_addr = (void *)0xdefec8ed;
kmem_null_cache.cache_next = &kmem_null_cache;
kmem_null_cache.cache_prev = &kmem_null_cache;
kmem_maxraw += PAGESIZE; /* magazine, slab, bufctl MUST be raw */
kmem_magazine_type[0].mt_maxbuf -= INT_MAX; /* no mags for mags */
for (i = 1; i < sizeof (kmem_magazine_type) / sizeof (*mtp); i++) {
char namebuf[64];
mtp = &kmem_magazine_type[i];
sprintf(namebuf, "kmem_magazine_%d", mtp->mt_magsize);
mtp->mt_cache = kmem_cache_create(namebuf,
(mtp->mt_magsize + 1) * sizeof (void *),
mtp->mt_align, NULL, NULL, NULL);
}
kmem_magazine_type[0].mt_maxbuf += INT_MAX;
kmem_slab_cache = kmem_cache_create("kmem_slab_cache",
sizeof (kmem_slab_t), 4 * sizeof (int), NULL, NULL, NULL);
kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache",
(kmem_flags & KMF_AUDIT) ?
sizeof (kmem_bufctl_audit_t) : sizeof (kmem_bufctl_t),
4 * sizeof (int), NULL, NULL, NULL);
kmem_maxraw -= PAGESIZE;
kmem_reap_interval = 15 * HZ;
kmem_depot_contention = (KMEM_DEPOT_CONTENTION * kmem_reap_interval) /
(60 * HZ);
/*
* Freed-address verification is a side-effect of the hash lookup in
* any hashed cache. Therefore, to implement KMF_VERIFY debugging for
* all caches, all we have to do is set the hashing threshold to zero.
* In order to provide audit information we also need to set
* the hashing threshold to zero.
*/
if (kmem_flags & (KMF_VERIFY | KMF_AUDIT))
kmem_maxraw = 0;
if (kmem_flags & KMF_PAGEPERBUF)
kmem_align = PAGESIZE;
if (kmem_flags & KMF_AUDIT) {
/*
* Use about 2% of available memory for the transaction log.
*/
u_int physmegs = physmem >> (20 - PAGESHIFT);
u_int virtmegs = (Syslimit - Sysbase) >> 20;
if (kmem_log_size == 0)
kmem_log_size = min(physmegs, virtmegs) *
(20000 / sizeof (kmem_bufctl_audit_t));
kmem_log_size = 1 << highbit(kmem_log_size);
kmem_log = kmem_perm_alloc(kmem_log_size *
sizeof (kmem_bufctl_audit_t), 0, KM_SLEEP);
bzero((void *)kmem_log,
kmem_log_size * sizeof (kmem_bufctl_audit_t));
}
/*
* Set up the default caches to back kmem_alloc()
*/
size = KMEM_ALIGN;
for (i = 0; i < sizeof (kmem_alloc_sizes) / sizeof (int); i++) {
char name[40];
int align = KMEM_ALIGN;
cache_size = kmem_alloc_sizes[i];
if (cache_size > kmem_maxraw &&
(cache_size & (cache_size - 1)) == 0)
align = cache_size;
if ((cache_size & PAGEOFFSET) == 0)
align = PAGESIZE;
sprintf(name, "kmem_alloc_%d", cache_size);
cp = kmem_cache_create(name, cache_size, align,
NULL, NULL, NULL);
while (size <= cache_size) {
kmem_alloc_table[(size - 1) >> KMEM_ALIGN_SHIFT] = cp;
size += KMEM_ALIGN;
}
}
kmem_flags |= KMF_READY;
kmem_ready = 1;
}