This patch prepares the slab allocator to handle caches having annotations
(useroffset and usersize) defining usercopy regions.
This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
whitelisting code in the last public patch of grsecurity/PaX based on
my understanding of the code. Changes or omissions from the original
code are mine and don't reflect the original grsecurity/PaX code.
Currently, hardened usercopy performs dynamic bounds checking on slab
cache objects. This is good, but still leaves a lot of kernel memory
available to be copied to/from userspace in the face of bugs. To further
restrict what memory is available for copying, this creates a way to
whitelist specific areas of a given slab cache object for copying to/from
userspace, allowing much finer granularity of access control. Slab caches
that are never exposed to userspace can declare no whitelist for their
objects, thereby keeping them unavailable to userspace via dynamic copy
operations. (Note, an implicit form of whitelisting is the use of constant
sizes in usercopy operations and get_user()/put_user(); these bypass
hardened usercopy checks since these sizes cannot change at runtime.)
To support this whitelist annotation, usercopy region offset and size
members are added to struct kmem_cache. The slab allocator receives a
new function, kmem_cache_create_usercopy(), that creates a new cache
with a usercopy region defined, suitable for declaring spans of fields
within the objects that get copied to/from userspace.
In this patch, the default kmem_cache_create() marks the entire allocation
as whitelisted, leaving it semantically unchanged. Once all fine-grained
whitelists have been added (in subsequent patches), this will be changed
to a usersize of 0, making caches created with kmem_cache_create() not
copyable to/from userspace.
After the entire usercopy whitelist series is applied, less than 15%
of the slab cache memory remains exposed to potential usercopy bugs
after a fresh boot:
Total Slab Memory: 48074720
Usercopyable Memory: 6367532 13.2%
task_struct 0.2% 4480/1630720
RAW 0.3% 300/96000
RAWv6 2.1% 1408/64768
ext4_inode_cache 3.0% 269760/8740224
dentry 11.1% 585984/5273856
mm_struct 29.1% 54912/188448
kmalloc-8 100.0% 24576/24576
kmalloc-16 100.0% 28672/28672
kmalloc-32 100.0% 81920/81920
kmalloc-192 100.0% 96768/96768
kmalloc-128 100.0% 143360/143360
names_cache 100.0% 163840/163840
kmalloc-64 100.0% 167936/167936
kmalloc-256 100.0% 339968/339968
kmalloc-512 100.0% 350720/350720
kmalloc-96 100.0% 455616/455616
kmalloc-8192 100.0% 655360/655360
kmalloc-1024 100.0% 812032/812032
kmalloc-4096 100.0% 819200/819200
kmalloc-2048 100.0% 1310720/1310720
After some kernel build workloads, the percentage (mainly driven by
dentry and inode caches expanding) drops under 10%:
Total Slab Memory: 95516184
Usercopyable Memory: 8497452 8.8%
task_struct 0.2% 4000/1456000
RAW 0.3% 300/96000
RAWv6 2.1% 1408/64768
ext4_inode_cache 3.0% 1217280/39439872
dentry 11.1% 1623200/14608800
mm_struct 29.1% 73216/251264
kmalloc-8 100.0% 24576/24576
kmalloc-16 100.0% 28672/28672
kmalloc-32 100.0% 94208/94208
kmalloc-192 100.0% 96768/96768
kmalloc-128 100.0% 143360/143360
names_cache 100.0% 163840/163840
kmalloc-64 100.0% 245760/245760
kmalloc-256 100.0% 339968/339968
kmalloc-512 100.0% 350720/350720
kmalloc-96 100.0% 563520/563520
kmalloc-8192 100.0% 655360/655360
kmalloc-1024 100.0% 794624/794624
kmalloc-4096 100.0% 819200/819200
kmalloc-2048 100.0% 1257472/1257472
Signed-off-by: David Windsor <dave@nullcore.net>
[kees: adjust commit log, split out a few extra kmalloc hunks]
[kees: add field names to function declarations]
[kees: convert BUGs to WARNs and fail closed]
[kees: add attack surface reduction analysis to commit log]
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Christoph Lameter <cl@linux.com>
181 lines
5.4 KiB
C
181 lines
5.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SLUB_DEF_H
|
|
#define _LINUX_SLUB_DEF_H
|
|
|
|
/*
|
|
* SLUB : A Slab allocator without object queues.
|
|
*
|
|
* (C) 2007 SGI, Christoph Lameter
|
|
*/
|
|
#include <linux/kobject.h>
|
|
|
|
enum stat_item {
|
|
ALLOC_FASTPATH, /* Allocation from cpu slab */
|
|
ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
|
|
FREE_FASTPATH, /* Free to cpu slab */
|
|
FREE_SLOWPATH, /* Freeing not to cpu slab */
|
|
FREE_FROZEN, /* Freeing to frozen slab */
|
|
FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */
|
|
FREE_REMOVE_PARTIAL, /* Freeing removes last object */
|
|
ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */
|
|
ALLOC_SLAB, /* Cpu slab acquired from page allocator */
|
|
ALLOC_REFILL, /* Refill cpu slab from slab freelist */
|
|
ALLOC_NODE_MISMATCH, /* Switching cpu slab */
|
|
FREE_SLAB, /* Slab freed to the page allocator */
|
|
CPUSLAB_FLUSH, /* Abandoning of the cpu slab */
|
|
DEACTIVATE_FULL, /* Cpu slab was full when deactivated */
|
|
DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */
|
|
DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */
|
|
DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */
|
|
DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
|
|
DEACTIVATE_BYPASS, /* Implicit deactivation */
|
|
ORDER_FALLBACK, /* Number of times fallback was necessary */
|
|
CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
|
|
CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */
|
|
CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */
|
|
CPU_PARTIAL_FREE, /* Refill cpu partial on free */
|
|
CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
|
|
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
|
|
NR_SLUB_STAT_ITEMS };
|
|
|
|
struct kmem_cache_cpu {
|
|
void **freelist; /* Pointer to next available object */
|
|
unsigned long tid; /* Globally unique transaction id */
|
|
struct page *page; /* The slab from which we are allocating */
|
|
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
|
struct page *partial; /* Partially allocated frozen slabs */
|
|
#endif
|
|
#ifdef CONFIG_SLUB_STATS
|
|
unsigned stat[NR_SLUB_STAT_ITEMS];
|
|
#endif
|
|
};
|
|
|
|
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
|
#define slub_percpu_partial(c) ((c)->partial)
|
|
|
|
#define slub_set_percpu_partial(c, p) \
|
|
({ \
|
|
slub_percpu_partial(c) = (p)->next; \
|
|
})
|
|
|
|
#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c))
|
|
#else
|
|
#define slub_percpu_partial(c) NULL
|
|
|
|
#define slub_set_percpu_partial(c, p)
|
|
|
|
#define slub_percpu_partial_read_once(c) NULL
|
|
#endif // CONFIG_SLUB_CPU_PARTIAL
|
|
|
|
/*
|
|
* Word size structure that can be atomically updated or read and that
|
|
* contains both the order and the number of objects that a slab of the
|
|
* given order would contain.
|
|
*/
|
|
struct kmem_cache_order_objects {
|
|
unsigned long x;
|
|
};
|
|
|
|
/*
|
|
* Slab cache management.
|
|
*/
|
|
struct kmem_cache {
|
|
struct kmem_cache_cpu __percpu *cpu_slab;
|
|
/* Used for retriving partial slabs etc */
|
|
slab_flags_t flags;
|
|
unsigned long min_partial;
|
|
int size; /* The size of an object including meta data */
|
|
int object_size; /* The size of an object without meta data */
|
|
int offset; /* Free pointer offset. */
|
|
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
|
int cpu_partial; /* Number of per cpu partial objects to keep around */
|
|
#endif
|
|
struct kmem_cache_order_objects oo;
|
|
|
|
/* Allocation and freeing of slabs */
|
|
struct kmem_cache_order_objects max;
|
|
struct kmem_cache_order_objects min;
|
|
gfp_t allocflags; /* gfp flags to use on each alloc */
|
|
int refcount; /* Refcount for slab cache destroy */
|
|
void (*ctor)(void *);
|
|
int inuse; /* Offset to metadata */
|
|
int align; /* Alignment */
|
|
int reserved; /* Reserved bytes at the end of slabs */
|
|
int red_left_pad; /* Left redzone padding size */
|
|
const char *name; /* Name (only for display!) */
|
|
struct list_head list; /* List of slab caches */
|
|
#ifdef CONFIG_SYSFS
|
|
struct kobject kobj; /* For sysfs */
|
|
struct work_struct kobj_remove_work;
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
struct memcg_cache_params memcg_params;
|
|
int max_attr_size; /* for propagation, maximum size of a stored attr */
|
|
#ifdef CONFIG_SYSFS
|
|
struct kset *memcg_kset;
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef CONFIG_SLAB_FREELIST_HARDENED
|
|
unsigned long random;
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
|
|
* Defragmentation by allocating from a remote node.
|
|
*/
|
|
int remote_node_defrag_ratio;
|
|
#endif
|
|
|
|
#ifdef CONFIG_SLAB_FREELIST_RANDOM
|
|
unsigned int *random_seq;
|
|
#endif
|
|
|
|
#ifdef CONFIG_KASAN
|
|
struct kasan_cache kasan_info;
|
|
#endif
|
|
|
|
size_t useroffset; /* Usercopy region offset */
|
|
size_t usersize; /* Usercopy region size */
|
|
|
|
struct kmem_cache_node *node[MAX_NUMNODES];
|
|
};
|
|
|
|
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
|
#define slub_cpu_partial(s) ((s)->cpu_partial)
|
|
#define slub_set_cpu_partial(s, n) \
|
|
({ \
|
|
slub_cpu_partial(s) = (n); \
|
|
})
|
|
#else
|
|
#define slub_cpu_partial(s) (0)
|
|
#define slub_set_cpu_partial(s, n)
|
|
#endif // CONFIG_SLUB_CPU_PARTIAL
|
|
|
|
#ifdef CONFIG_SYSFS
|
|
#define SLAB_SUPPORTS_SYSFS
|
|
void sysfs_slab_release(struct kmem_cache *);
|
|
#else
|
|
static inline void sysfs_slab_release(struct kmem_cache *s)
|
|
{
|
|
}
|
|
#endif
|
|
|
|
void object_err(struct kmem_cache *s, struct page *page,
|
|
u8 *object, char *reason);
|
|
|
|
void *fixup_red_left(struct kmem_cache *s, void *p);
|
|
|
|
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
|
|
void *x) {
|
|
void *object = x - (x - page_address(page)) % cache->size;
|
|
void *last_object = page_address(page) +
|
|
(page->objects - 1) * cache->size;
|
|
void *result = (unlikely(object > last_object)) ? last_object : object;
|
|
|
|
result = fixup_red_left(cache, result);
|
|
return result;
|
|
}
|
|
|
|
#endif /* _LINUX_SLUB_DEF_H */
|