Skip to content

Commit 99c55f7

Browse files
Alexei Starovoitov, davem330
Alexei Starovoitov
authored and committed
bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
 * and returns map_fd on success.
 * use close(map_fd) to delete the map
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
		   int value_size, int max_entries)
{
	union bpf_attr attr = {
		.map_type = map_type,
		.key_size = key_size,
		.value_size = value_size,
		.max_entries = max_entries
	};

	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

'union bpf_attr' is backwards compatible with future extensions.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 4a8e320 commit 99c55f7

File tree

5 files changed

+273
-1
lines changed

5 files changed

+273
-1
lines changed

Documentation/networking/filter.txt

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
10011001
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
10021002
32-bit immediate value into a register.
10031003

1004+
eBPF maps
1005+
---------
1006+
'maps' is a generic storage of different types for sharing data between kernel
1007+
and userspace.
1008+
1009+
The maps are accessed from user space via BPF syscall, which has commands:
1010+
- create a map with given type and attributes
1011+
map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
1012+
using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
1013+
returns process-local file descriptor or negative error
1014+
1015+
- lookup key in a given map
1016+
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
1017+
using attr->map_fd, attr->key, attr->value
1018+
returns zero and stores found elem into value or negative error
1019+
1020+
- create or update key/value pair in a given map
1021+
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
1022+
using attr->map_fd, attr->key, attr->value
1023+
returns zero or negative error
1024+
1025+
- find and delete element by key in a given map
1026+
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
1027+
using attr->map_fd, attr->key
1028+
1029+
- to delete map: close(fd)
1030+
Exiting process will delete maps automatically
1031+
1032+
userspace programs use this syscall to create/access maps that eBPF programs
1033+
are concurrently updating.
1034+
1035+
maps can have different types: hash, array, bloom filter, radix-tree, etc.
1036+
1037+
The map is defined by:
1038+
. type
1039+
. max number of elements
1040+
. key size in bytes
1041+
. value size in bytes
1042+
10041043
Testing
10051044
-------
10061045

include/linux/bpf.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*/
7+
#ifndef _LINUX_BPF_H
8+
#define _LINUX_BPF_H 1
9+
10+
#include <uapi/linux/bpf.h>
11+
#include <linux/workqueue.h>
12+
13+
struct bpf_map;
14+
15+
/* map is generic key/value storage optionally accessible by eBPF programs */
16+
struct bpf_map_ops {
17+
/* funcs callable from userspace (via syscall) */
18+
struct bpf_map *(*map_alloc)(union bpf_attr *attr); /* create a map from attr; callers test the result with IS_ERR() */
19+
void (*map_free)(struct bpf_map *); /* implementation-dependent freeing of a map */
20+
};
21+
22+
/* Common state of one map instance, shared by every map implementation. */
struct bpf_map {
23+
atomic_t refcnt; /* set to 1 by map_create(); dropped via bpf_map_put() */
24+
enum bpf_map_type map_type; /* copied from attr->map_type in find_and_alloc_map() */
25+
u32 key_size; /* NOTE(review): not assigned by the syscall code shown here; presumably set by ops->map_alloc() - confirm */
26+
u32 value_size; /* NOTE(review): same as key_size - confirm who fills it */
27+
u32 max_entries; /* NOTE(review): same as key_size - confirm who fills it */
28+
struct bpf_map_ops *ops; /* ops of the matching registered map type */
29+
struct work_struct work; /* work item bpf_map_put() uses to run ops->map_free() from a workqueue */
30+
};
31+
32+
/* Registration record for one map implementation; see bpf_register_map_type(). */
struct bpf_map_type_list {
33+
struct list_head list_node; /* link in the list of registered map types */
34+
struct bpf_map_ops *ops; /* callbacks used for maps of this type */
35+
enum bpf_map_type type; /* value matched against attr->map_type on map creation */
36+
};
37+
38+
void bpf_register_map_type(struct bpf_map_type_list *tl); /* add tl to the list of known map types */
39+
void bpf_map_put(struct bpf_map *map); /* drop one reference; the last put schedules the map for freeing */
40+
41+
#endif /* _LINUX_BPF_H */

include/uapi/linux/bpf.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,27 @@ struct bpf_insn {
6262
__s32 imm; /* signed immediate constant */
6363
};
6464

65+
/* BPF syscall commands */
66+
enum bpf_cmd {
67+
/* create a map with given type and attributes
68+
* fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
69+
* returns fd or negative error
70+
* map is deleted when fd is closed
71+
*/
72+
BPF_MAP_CREATE, /* handled by map_create() in kernel/bpf/syscall.c */
73+
};
74+
75+
/* Map implementations selectable through attr->map_type. */
enum bpf_map_type {
76+
BPF_MAP_TYPE_UNSPEC, /* the only value defined at this point */
77+
};
78+
79+
/* Argument block of the bpf syscall. The kernel copies at most
 * sizeof(union bpf_attr) bytes from user space and rejects a larger
 * user buffer unless all of its extra bytes are zero.
 */
union bpf_attr {
80+
struct { /* anonymous struct used by BPF_MAP_CREATE command */
81+
__u32 map_type; /* one of enum bpf_map_type */
82+
__u32 key_size; /* size of key in bytes */
83+
__u32 value_size; /* size of value in bytes */
84+
__u32 max_entries; /* max number of entries in a map */
85+
};
86+
} __attribute__((aligned(8)));
87+
6588
#endif /* _UAPI__LINUX_BPF_H__ */

kernel/bpf/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
obj-y := core.o
1+
obj-y := core.o syscall.o

kernel/bpf/syscall.c

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*
7+
* This program is distributed in the hope that it will be useful, but
8+
* WITHOUT ANY WARRANTY; without even the implied warranty of
9+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10+
* General Public License for more details.
11+
*/
12+
#include <linux/bpf.h>
13+
#include <linux/syscalls.h>
14+
#include <linux/slab.h>
15+
#include <linux/anon_inodes.h>
16+
17+
static LIST_HEAD(bpf_map_types); /* registered map implementations, linked through bpf_map_type_list.list_node */
18+
19+
/* Search bpf_map_types for an entry whose type equals attr->map_type and
 * let that entry's map_alloc() create the map.
 *
 * Returns the new map with ->ops and ->map_type filled in, the ERR_PTR
 * produced by map_alloc() on failure, or ERR_PTR(-EINVAL) when no
 * registered type matches.
 */
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
20+
{
21+
struct bpf_map_type_list *tl;
22+
struct bpf_map *map;
23+
24+
list_for_each_entry(tl, &bpf_map_types, list_node) {
25+
if (tl->type == attr->map_type) {
26+
map = tl->ops->map_alloc(attr);
27+
if (IS_ERR(map))
28+
return map; /* pass the implementation's error through unchanged */
29+
map->ops = tl->ops;
30+
map->map_type = attr->map_type;
31+
return map;
32+
}
33+
}
34+
return ERR_PTR(-EINVAL); /* no registered implementation for this map_type */
35+
}
36+
37+
/* boot time registration of different map implementations */
38+
void bpf_register_map_type(struct bpf_map_type_list *tl)
39+
{
40+
list_add(&tl->list_node, &bpf_map_types); /* no lock is taken here; NOTE(review): relies on registration happening only at boot - confirm */
41+
}
42+
43+
/* called from workqueue */
44+
static void bpf_map_free_deferred(struct work_struct *work)
45+
{
46+
struct bpf_map *map = container_of(work, struct bpf_map, work); /* recover the map that embeds this work item */
47+
48+
/* implementation dependent freeing */
49+
map->ops->map_free(map);
50+
}
51+
52+
/* decrement map refcnt and schedule it for freeing via workqueue
53+
* (underlying map implementation ops->map_free() might sleep)
54+
*/
55+
void bpf_map_put(struct bpf_map *map)
56+
{
57+
if (atomic_dec_and_test(&map->refcnt)) { /* true only for the caller that drops the last reference */
58+
INIT_WORK(&map->work, bpf_map_free_deferred);
59+
schedule_work(&map->work); /* bpf_map_free_deferred() then calls ops->map_free() from the workqueue */
60+
}
61+
}
62+
63+
/* ->release handler of bpf_map_fops: drops the reference held by the file.
 * The map pointer is read from filp->private_data; map_create() passes the
 * map as the private argument of anon_inode_getfd().
 * Always returns 0.
 */
static int bpf_map_release(struct inode *inode, struct file *filp)
64+
{
65+
struct bpf_map *map = filp->private_data;
66+
67+
bpf_map_put(map);
68+
return 0;
69+
}
70+
71+
/* File operations for map fds created by map_create(); only ->release is set. */
static const struct file_operations bpf_map_fops = {
72+
.release = bpf_map_release,
73+
};
74+
75+
/* helper macro to check that unused fields of 'union bpf_attr' are zero */
76+
/* CHECK_ATTR(CMD) expects a 'union bpf_attr *attr' in scope and a
 * CMD##_LAST_FIELD macro naming the last field the command uses.
 * It scans every byte of *attr that follows that field.
 * Note the polarity: the expression is true (non-zero) when a non-zero
 * byte is found, i.e. when the check FAILS.
 */
#define CHECK_ATTR(CMD) \
77+
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
78+
sizeof(attr->CMD##_LAST_FIELD), 0, \
79+
sizeof(*attr) - \
80+
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
81+
sizeof(attr->CMD##_LAST_FIELD)) != NULL
82+
83+
#define BPF_MAP_CREATE_LAST_FIELD max_entries
84+
/* called via syscall
 *
 * Handles BPF_MAP_CREATE: rejects attributes with non-zero bytes after
 * max_entries, allocates the map through the registered map type, sets
 * its refcnt to 1 and wraps it in an anonymous-inode file.
 *
 * Returns the new file descriptor (opened O_RDWR | O_CLOEXEC) on success,
 * -EINVAL for stray attribute bytes, the error from find_and_alloc_map(),
 * or the negative value returned by anon_inode_getfd().
 */
85+
static int map_create(union bpf_attr *attr)
86+
{
87+
struct bpf_map *map;
88+
int err;
89+
90+
err = CHECK_ATTR(BPF_MAP_CREATE); /* non-zero when bytes after max_entries are set */
91+
if (err)
92+
return -EINVAL;
93+
94+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
95+
map = find_and_alloc_map(attr);
96+
if (IS_ERR(map))
97+
return PTR_ERR(map);
98+
99+
atomic_set(&map->refcnt, 1); /* this reference is dropped by bpf_map_release() */
100+
101+
err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
102+
103+
if (err < 0)
104+
/* failed to allocate fd */
105+
goto free_map;
106+
107+
return err;
108+
109+
free_map:
110+
map->ops->map_free(map); /* freed directly here rather than through bpf_map_put() */
111+
return err;
112+
}
113+
114+
/* bpf syscall entry point.
 *
 * cmd   - one of enum bpf_cmd
 * uattr - user pointer to the command's union bpf_attr
 * size  - number of bytes user space provides at uattr
 *
 * Copies at most sizeof(union bpf_attr) bytes of attributes into a
 * zero-initialized local copy and dispatches on cmd.
 *
 * Returns the command's result, or:
 *   -EPERM  caller lacks CAP_SYS_ADMIN
 *   -EFAULT uattr fails access_ok() or copy_from_user()
 *   -E2BIG  size > PAGE_SIZE, or a byte beyond sizeof(union bpf_attr) is non-zero
 *   -EINVAL unknown cmd
 * An error from get_user() while scanning the extra bytes is returned as is.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
115+
{
116+
union bpf_attr attr = {}; /* zero-filled so fields past a short user copy read as 0 */
117+
int err;
118+
119+
/* the syscall is limited to root temporarily. This restriction will be
120+
* lifted when security audit is clean. Note that eBPF+tracing must have
121+
* this restriction, since it may pass kernel data to user space
122+
*/
123+
if (!capable(CAP_SYS_ADMIN))
124+
return -EPERM;
125+
126+
if (!access_ok(VERIFY_READ, uattr, 1))
127+
return -EFAULT;
128+
129+
if (size > PAGE_SIZE) /* silly large */
130+
return -E2BIG;
131+
132+
/* If we're handed a bigger struct than we know of,
133+
* ensure all the unknown bits are 0 - i.e. new
134+
* user-space does not rely on any kernel feature
135+
* extensions we don't know about yet.
136+
*/
137+
if (size > sizeof(attr)) {
138+
unsigned char __user *addr;
139+
unsigned char __user *end;
140+
unsigned char val;
141+
142+
addr = (void __user *)uattr + sizeof(attr);
143+
end = (void __user *)uattr + size;
144+
145+
for (; addr < end; addr++) {
146+
err = get_user(val, addr);
147+
if (err)
148+
return err;
149+
if (val)
150+
return -E2BIG;
151+
}
152+
size = sizeof(attr); /* extra bytes were all zero; copy only the part this kernel knows */
153+
}
154+
155+
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
156+
if (copy_from_user(&attr, uattr, size) != 0)
157+
return -EFAULT;
158+
159+
switch (cmd) {
160+
case BPF_MAP_CREATE:
161+
err = map_create(&attr);
162+
break;
163+
default:
164+
err = -EINVAL;
165+
break;
166+
}
167+
168+
return err;
169+
}

0 commit comments

Comments
 (0)