Skip to content

Commit 75a1a60

Browse files
borkmannAlexei Starovoitov
authored and
Alexei Starovoitov
committed
uaccess: Add strict non-pagefault kernel-space read function
Add two new probe_kernel_read_strict() and strncpy_from_unsafe_strict() helpers which by default alias to the __probe_kernel_read() and the __strncpy_from_unsafe(), respectively, but can be overridden by archs which have non-overlapping address ranges for kernel space and user space in order to bail out with -EFAULT when attempting to probe user memory including non-canonical user access addresses [0]:

  4-level page tables:
    user-space mem: 0x0000000000000000 - 0x00007fffffffffff
    non-canonical:  0x0000800000000000 - 0xffff7fffffffffff

  5-level page tables:
    user-space mem: 0x0000000000000000 - 0x00ffffffffffffff
    non-canonical:  0x0100000000000000 - 0xfeffffffffffffff

The idea is that these helpers are complementary to the probe_user_read() and strncpy_from_unsafe_user() which probe user-only memory. Both added helpers here do the same, but for kernel-only addresses.

Both sets of helpers are going to be used for BPF tracing. They also explicitly avoid throwing the splat for non-canonical user addresses from 00c4237 ("x86-64: add warning for non-canonical user access address dereferences").

For compat, the current probe_kernel_read() and strncpy_from_unsafe() are left as-is.

  [0] Documentation/x86/x86_64/mm.txt

Signed-off-by: Daniel Borkmann <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net
1 parent 1d1585c commit 75a1a60

File tree

4 files changed

+72
-2
lines changed

4 files changed

+72
-2
lines changed

arch/x86/mm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
1313
endif
1414

1515
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
16-
pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
16+
pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
1717

1818
# Make sure __phys_addr has no stackprotector
1919
nostackp := $(call cc-option, -fno-stack-protector)

arch/x86/mm/maccess.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
3+
#include <linux/uaccess.h>
4+
#include <linux/kernel.h>
5+
6+
#ifdef CONFIG_X86_64
7+
/*
 * Sign-extend @vaddr from bit (@vaddr_bits - 1) into all higher bits and
 * return the result. An address that already has its upper bits equal to
 * that bit comes back unchanged, which is what callers compare against.
 *
 * @vaddr:      virtual address to canonicalize
 * @vaddr_bits: number of implemented virtual address bits
 */
static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits)
{
	const int shift = 64 - vaddr_bits;

	/*
	 * Do the left shift on the unsigned value: left-shifting a signed
	 * value that is negative or whose result overflows is undefined in
	 * ISO C. Only the right shift is performed on s64, relying (as the
	 * previous form did) on the compiler shifting signed values
	 * arithmetically so the new top bit is replicated downwards.
	 */
	return (u64)((s64)(vaddr << shift) >> shift);
}
11+
12+
/*
 * Decide whether @vaddr must be rejected by the strict kernel probes.
 *
 * Returns true when @vaddr lies below TASK_SIZE_MAX + PAGE_SIZE, i.e. in
 * the canonical userspace range including the userspace guard page, or
 * when it is not a canonical address for boot_cpu_data.x86_virt_bits.
 */
static __always_inline bool invalid_probe_range(u64 vaddr)
{
	/* Canonical userspace addresses plus the guard page above them. */
	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
		return true;

	/* Anything that sign-extension would alter is non-canonical. */
	return vaddr != canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
22+
#else
23+
/*
 * Variant used when CONFIG_X86_64 is not set: an address is rejected
 * exactly when it is below TASK_SIZE_MAX.
 */
static __always_inline bool invalid_probe_range(u64 vaddr)
{
	const bool below_task_size = vaddr < TASK_SIZE_MAX;

	return below_task_size;
}
27+
#endif
28+
29+
/*
 * Strict variant of the kernel read probe.
 *
 * @dst:  destination buffer
 * @src:  address to read from
 * @size: number of bytes to read
 *
 * Returns -EFAULT without touching memory when invalid_probe_range()
 * rejects @src; otherwise returns whatever __probe_kernel_read() returns.
 * Only the start address @src is checked here.
 */
long probe_kernel_read_strict(void *dst, const void *src, size_t size)
{
	const unsigned long addr = (unsigned long)src;

	if (likely(!invalid_probe_range(addr)))
		return __probe_kernel_read(dst, src, size);

	return -EFAULT;
}
36+
37+
long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count)
38+
{
39+
if (unlikely(invalid_probe_range((unsigned long)unsafe_addr)))
40+
return -EFAULT;
41+
42+
return __strncpy_from_unsafe(dst, unsafe_addr, count);
43+
}

include/linux/uaccess.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
311311
* happens, handle that and return -EFAULT.
312312
*/
313313
extern long probe_kernel_read(void *dst, const void *src, size_t size);
314+
extern long probe_kernel_read_strict(void *dst, const void *src, size_t size);
314315
extern long __probe_kernel_read(void *dst, const void *src, size_t size);
315316

316317
/*
@@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s
350351
extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size);
351352

352353
extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
354+
extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
355+
long count);
356+
extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
353357
extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
354358
long count);
355359
extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);

mm/maccess.c

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,20 @@ probe_write_common(void __user *dst, const void *src, size_t size)
4343
* do_page_fault() doesn't attempt to take mmap_sem. This makes
4444
* probe_kernel_read() suitable for use within regions where the caller
4545
* already holds mmap_sem, or other locks which nest inside mmap_sem.
46+
*
47+
* probe_kernel_read_strict() is the same as probe_kernel_read() except for
48+
* the case where architectures have non-overlapping user and kernel address
49+
* ranges: probe_kernel_read_strict() will additionally return -EFAULT for
50+
* probing memory on a user address range where probe_user_read() is supposed
51+
* to be used instead.
4652
*/
4753

4854
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
4955
__attribute__((alias("__probe_kernel_read")));
5056

57+
long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
58+
__attribute__((alias("__probe_kernel_read")));
59+
5160
long __probe_kernel_read(void *dst, const void *src, size_t size)
5261
{
5362
long ret;
@@ -157,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_user_write);
157166
*
158167
* If @count is smaller than the length of the string, copies @count-1 bytes,
159168
* sets the last byte of @dst buffer to NUL and returns @count.
169+
*
170+
* strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
171+
* for the case where architectures have non-overlapping user and kernel address
172+
* ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
173+
* probing memory on a user address range where strncpy_from_unsafe_user() is
174+
* supposed to be used instead.
160175
*/
161-
long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
176+
177+
long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
178+
__attribute__((alias("__strncpy_from_unsafe")));
179+
180+
long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
181+
long count)
182+
__attribute__((alias("__strncpy_from_unsafe")));
183+
184+
long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
162185
{
163186
mm_segment_t old_fs = get_fs();
164187
const void *src = unsafe_addr;

0 commit comments

Comments
 (0)