-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
SAS7BDAT parser: Fast byteswap #47403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit
Hold shift + click to select a range
5b9cd4b
Fast byteswap
jonashaag 17c965f
Add types
jonashaag 51499fb
Merge branch 'main' into sas/byteswap
jonashaag 435a003
Review feedback
jonashaag 10ab87f
Slightly faster variant (1 less bytes obj construction)
jonashaag ad74f5c
Make MyPy happy?
jonashaag 9c5b4b3
Update sas7bdat.py
jonashaag 21c364c
Merge branch 'main' into sas/byteswap
jonashaag 148fa75
Merge branch 'main' into sas/byteswap
jonashaag f3c63f0
Use intrinsics
jonashaag 78de495
Merge branch 'main' into sas/byteswap
jonashaag 4ef928e
Merge branch 'main' into sas/byteswap
jonashaag c310c0d
Lint
jonashaag 3b7ba83
Add tests + move byteswap to module
jonashaag 53fbce2
Add float tests + refactoring
jonashaag 9cbc5be
Undo unrelated changes
jonashaag 4802848
Undo unrelated changes
jonashaag 41abe02
Lint
jonashaag 2abd8e0
Merge branch 'main' into sas/byteswap
jonashaag bf0976a
Update v1.6.0.rst
jonashaag c725d49
Merge branch 'main' into sas/byteswap
jonashaag c7c1a2f
read_int -> read_uint
jonashaag 6a4a556
Lint
jonashaag 9f5ba3f
Merge branch 'main' into sas/byteswap
jonashaag a439434
Update sas7bdat.py
jonashaag 55bd863
Merge branch 'main' into sas/byteswap
jonashaag bdf8203
Merge branch 'main' into sas/byteswap
jonashaag File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Type stubs for the compiled byteswap readers.
# Every reader takes the raw buffer, the offset of the first byte to read and
# a flag saying whether the value's byte order must be reversed.

# Floating-point readers.
def read_float_with_byteswap(
    data: bytes, offset: int, byteswap: bool
) -> float: ...
def read_double_with_byteswap(
    data: bytes, offset: int, byteswap: bool
) -> float: ...

# Unsigned integer readers.
def read_uint16_with_byteswap(
    data: bytes, offset: int, byteswap: bool
) -> int: ...
def read_uint32_with_byteswap(
    data: bytes, offset: int, byteswap: bool
) -> int: ...
def read_uint64_with_byteswap(
    data: bytes, offset: int, byteswap: bool
) -> int: ...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
""" | ||
The following are faster versions of struct.unpack that avoid the overhead of Python function calls. | ||
|
||
In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times. | ||
""" | ||
from cython cimport Py_ssize_t | ||
from libc.stdint cimport ( | ||
uint16_t, | ||
uint32_t, | ||
uint64_t, | ||
) | ||
|
||
|
||
def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Read a 4-byte C ``float`` from ``data`` starting at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index of the first byte to read; the read covers
        ``data[offset:offset + 4]``.
    byteswap : bint
        If true, the byte order of the value is reversed before it is
        returned.

    Returns
    -------
    float

    Raises
    ------
    AssertionError
        If the 4 bytes to read do not lie entirely inside ``data``.
    """
    # A read that ends exactly at the end of the buffer is valid, hence "<=".
    # A negative offset would dereference memory in front of the buffer, so
    # it has to be rejected as well.
    assert 0 <= offset and offset + 4 <= len(data)
    cdef:
        const char *data_ptr = data
        float res = (<float*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_float(res)
    return res
|
||
|
||
def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Read an 8-byte C ``double`` from ``data`` starting at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index of the first byte to read; the read covers
        ``data[offset:offset + 8]``.
    byteswap : bint
        If true, the byte order of the value is reversed before it is
        returned.

    Returns
    -------
    float

    Raises
    ------
    AssertionError
        If the 8 bytes to read do not lie entirely inside ``data``.
    """
    # A read that ends exactly at the end of the buffer is valid, hence "<=".
    # A negative offset would dereference memory in front of the buffer, so
    # it has to be rejected as well.
    assert 0 <= offset and offset + 8 <= len(data)
    cdef:
        const char *data_ptr = data
        double res = (<double*>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap_double(res)
    return res
|
||
|
||
def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Read a 2-byte unsigned integer from ``data`` starting at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index of the first byte to read; the read covers
        ``data[offset:offset + 2]``.
    byteswap : bint
        If true, the byte order of the value is reversed before it is
        returned.

    Returns
    -------
    int

    Raises
    ------
    AssertionError
        If the 2 bytes to read do not lie entirely inside ``data``.
    """
    # A read that ends exactly at the end of the buffer is valid, hence "<=".
    # A negative offset would dereference memory in front of the buffer, so
    # it has to be rejected as well.
    assert 0 <= offset and offset + 2 <= len(data)
    cdef:
        const char *data_ptr = data
        uint16_t res = (<uint16_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap2(res)
    return res
|
||
|
||
def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Read a 4-byte unsigned integer from ``data`` starting at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index of the first byte to read; the read covers
        ``data[offset:offset + 4]``.
    byteswap : bint
        If true, the byte order of the value is reversed before it is
        returned.

    Returns
    -------
    int

    Raises
    ------
    AssertionError
        If the 4 bytes to read do not lie entirely inside ``data``.
    """
    # A read that ends exactly at the end of the buffer is valid, hence "<=".
    # A negative offset would dereference memory in front of the buffer, so
    # it has to be rejected as well.
    assert 0 <= offset and offset + 4 <= len(data)
    cdef:
        const char *data_ptr = data
        uint32_t res = (<uint32_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap4(res)
    return res
|
||
|
||
def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
    """
    Read an 8-byte unsigned integer from ``data`` starting at ``offset``.

    Parameters
    ----------
    data : bytes
        Buffer to read from.
    offset : Py_ssize_t
        Index of the first byte to read; the read covers
        ``data[offset:offset + 8]``.
    byteswap : bint
        If true, the byte order of the value is reversed before it is
        returned.

    Returns
    -------
    int

    Raises
    ------
    AssertionError
        If the 8 bytes to read do not lie entirely inside ``data``.
    """
    # A read that ends exactly at the end of the buffer is valid, hence "<=".
    # A negative offset would dereference memory in front of the buffer, so
    # it has to be rejected as well.
    assert 0 <= offset and offset + 8 <= len(data)
    cdef:
        const char *data_ptr = data
        uint64_t res = (<uint64_t *>(data_ptr + offset))[0]
    if byteswap:
        res = _byteswap8(res)
    return res
|
||
|
||
# Byteswapping | ||
|
||
# Expose three C-level byte-swap routines to Cython under the names
# _byteswap2 / _byteswap4 / _byteswap8 (the suffix is the operand width in
# bytes). The verbatim C block below picks the implementation with the
# preprocessor: when _MSC_VER is defined the names expand to
# _byteswap_ushort / _byteswap_ulong / _byteswap_uint64, otherwise to
# __builtin_bswap16 / __builtin_bswap32 / __builtin_bswap64.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #define _byteswap2 _byteswap_ushort
    #define _byteswap4 _byteswap_ulong
    #define _byteswap8 _byteswap_uint64
    #else
    #define _byteswap2 __builtin_bswap16
    #define _byteswap4 __builtin_bswap32
    #define _byteswap8 __builtin_bswap64
    #endif
    """
    # Signatures Cython should assume for the macros defined above.
    uint16_t _byteswap2(uint16_t)
    uint32_t _byteswap4(uint32_t)
    uint64_t _byteswap8(uint64_t)
|
||
|
||
cdef inline float _byteswap_float(float num):
    """
    Return ``num`` with the order of its 4 bytes reversed.

    The local copy of ``num`` is reinterpreted as a 32-bit unsigned integer,
    swapped with ``_byteswap4`` in place, and then returned as a float.
    """
    cdef uint32_t *bits = <uint32_t *>&num
    cdef uint32_t swapped = _byteswap4(bits[0])
    bits[0] = swapped
    return num
|
||
|
||
cdef inline double _byteswap_double(double num):
    """
    Return ``num`` with the order of its 8 bytes reversed.

    The local copy of ``num`` is reinterpreted as a 64-bit unsigned integer,
    swapped with ``_byteswap8`` in place, and then returned as a double.
    """
    cdef uint64_t *bits = <uint64_t *>&num
    cdef uint64_t swapped = _byteswap8(bits[0])
    bits[0] = swapped
    return num
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from hypothesis import ( | ||
assume, | ||
example, | ||
given, | ||
strategies as st, | ||
) | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas._testing as tm | ||
|
||
from pandas.io.sas._byteswap import ( | ||
read_double_with_byteswap, | ||
read_float_with_byteswap, | ||
read_uint16_with_byteswap, | ||
read_uint32_with_byteswap, | ||
read_uint64_with_byteswap, | ||
) | ||
|
||
|
||
@given(read_offset=st.integers(0, 11), number=st.integers(min_value=0))
@example(number=2**16, read_offset=0)
@example(number=2**32, read_offset=0)
@example(number=2**64, read_offset=0)
@pytest.mark.parametrize("int_type", [np.uint16, np.uint32, np.uint64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_int_byteswap(read_offset, number, int_type, should_byteswap):
    """Check the unsigned integer readers with and without byte swapping."""
    # Discard generated numbers that do not fit into ``int_type``.
    n_bits = 8 * int_type(0).itemsize
    assume(number < 2**n_bits)
    _test(number, int_type, read_offset, should_byteswap)
|
||
|
||
@given(read_offset=st.integers(0, 11), number=st.floats())
@pytest.mark.parametrize("float_type", [np.float32, np.float64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_float_byteswap(read_offset, number, float_type, should_byteswap):
    """Check the float and double readers with and without byte swapping."""
    _test(
        number=number,
        number_type=float_type,
        read_offset=read_offset,
        should_byteswap=should_byteswap,
    )
|
||
|
||
def _test(number, number_type, read_offset, should_byteswap):
    """
    Write ``number`` into a random 20-byte buffer and read it back.

    The value is converted to ``number_type``, its raw bytes are stored at
    ``read_offset`` inside a buffer of random bytes, and the reader matching
    ``number_type`` is called on that buffer. The result must equal the
    value itself, or its byte-swapped form when ``should_byteswap`` is true.
    """
    value = number_type(number)
    buffer = np.random.default_rng().integers(0, 256, size=20, dtype="uint8")
    end = read_offset + value.itemsize
    buffer[read_offset:end] = value[None].view("uint8")
    readers = {
        np.float32: read_float_with_byteswap,
        np.float64: read_double_with_byteswap,
        np.uint16: read_uint16_with_byteswap,
        np.uint32: read_uint32_with_byteswap,
        np.uint64: read_uint64_with_byteswap,
    }
    reader = readers[type(value)]
    raw_result = reader(bytes(buffer), read_offset, should_byteswap)
    result = number_type(raw_result)
    expected = value.byteswap() if should_byteswap else value
    tm.assert_equal(result, expected)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.