-
-
Notifications
You must be signed in to change notification settings - Fork 144
ENH: Synchronize io/stata with pandas master #202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
34efdbc
163cebd
49734ca
646a60f
5d3648c
52e104d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ from typing import ( | |
Hashable, | ||
Literal, | ||
Sequence, | ||
overload, | ||
) | ||
|
||
import numpy as np | ||
|
@@ -22,6 +23,7 @@ from pandas._typing import ( | |
WriteBuffer, | ||
) | ||
|
||
@overload | ||
def read_stata( | ||
path: FilePath | ReadBuffer[bytes], | ||
convert_dates: bool = ..., | ||
|
@@ -32,57 +34,46 @@ def read_stata( | |
columns: list[HashableT] | None = ..., | ||
order_categoricals: bool = ..., | ||
chunksize: int | None = ..., | ||
iterator: bool = ..., | ||
*, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So the second and third overloads differ only in whether the arguments are keyword-only or can also be provided as positional arguments? If pandas had deprecated positional arguments, I would remove the positional overloads - but they are not yet deprecated. (I will try to open a PR for that at pandas later today.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yup. They are needed to handle the cases where `iterator` is passed positionally. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I opened pandas-dev/pandas#48128 to make most arguments keyword-only. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Passing these arguments positionally will now be deprecated in 1.5 (except for the first non-self argument). |
||
iterator: Literal[True], | ||
compression: CompressionOptions = ..., | ||
storage_options: StorageOptions = ..., | ||
) -> DataFrame | StataReader: ... | ||
|
||
stata_epoch: datetime.datetime = ... | ||
excessive_string_length_error: str | ||
) -> StataReader: ... | ||
# Overload for calls that supply every argument up to and including
# ``iterator`` (none of them has a default in this signature) with ``iterator``
# typed as ``Literal[True]``; such calls are typed as returning a
# ``StataReader``.
@overload
def read_stata(
    path: FilePath | ReadBuffer[bytes],
    convert_dates: bool,
    convert_categoricals: bool,
    index_col: str | None,
    convert_missing: bool,
    preserve_dtypes: bool,
    columns: list[HashableT] | None,
    order_categoricals: bool,
    chunksize: int | None,
    iterator: Literal[True],
    compression: CompressionOptions = ...,
    storage_options: StorageOptions = ...,
) -> StataReader: ...
# Overload for the default case: every argument after ``path`` is optional and
# ``iterator`` is typed as ``Literal[False]`` (also its default), so the call
# is typed as returning a ``DataFrame``.
@overload
def read_stata(
    path: FilePath | ReadBuffer[bytes],
    convert_dates: bool = ...,
    convert_categoricals: bool = ...,
    index_col: str | None = ...,
    convert_missing: bool = ...,
    preserve_dtypes: bool = ...,
    columns: list[HashableT] | None = ...,
    order_categoricals: bool = ...,
    chunksize: int | None = ...,
    iterator: Literal[False] = ...,
    compression: CompressionOptions = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame: ...
|
||
# Warning subclass; this stub declares no members beyond those of ``Warning``.
class PossiblePrecisionLoss(Warning): ...
|
||
precision_loss_doc: str | ||
|
||
# Warning subclass; this stub declares no members beyond those of ``Warning``.
class ValueLabelTypeMismatch(Warning): ...
|
||
value_label_mismatch_doc: str | ||
|
||
# Warning subclass; this stub declares no members beyond those of ``Warning``.
class InvalidColumnName(Warning): ...
|
||
invalid_name_doc: str | ||
|
||
# Type stub for ``StataValueLabel``: only attribute types and method
# signatures are declared here; there is no implementation in this file.
class StataValueLabel:
    labname: Hashable = ...
    value_labels: list[tuple[float, str]] = ...
    text_len: int = ...
    # NOTE(review): ``off``/``val``/``txt`` are presumably the offsets, values
    # and encoded label texts of the value-label table - confirm against the
    # pandas.io.stata implementation.
    off: npt.NDArray[np.int32] = ...
    val: npt.NDArray[np.int32] = ...
    txt: list[bytes] = ...
    n: int = ...
    # NOTE(review): this attribute name shadows the ``len`` builtin inside the
    # class namespace; it is part of the declared interface and left unchanged.
    len: int = ...
    # Built from a Series (``catarray``); ``encoding`` is restricted to
    # "latin-1" or "utf-8".
    def __init__(
        self, catarray: pd.Series, encoding: Literal["latin-1", "utf-8"] = ...
    ) -> None: ...
    # Takes a byte-order string and returns ``bytes``.
    def generate_value_label(self, byteorder: str) -> bytes: ...
|
||
# Type stub for ``StataMissingValue``: only attribute types and method
# signatures are declared here; there is no implementation in this file.
class StataMissingValue:
    MISSING_VALUES: dict[float, str] = ...
    bases: tuple[int, int, int] = ...
    float32_base: bytes = ...
    increment: int = ...
    int_value: int = ...
    float64_base: bytes = ...
    BASE_MISSING_VALUES: dict[str, int] = ...
    # Constructed from a single float value.
    def __init__(self, value: float) -> None: ...
    # Accepts any object, as required for compatibility with ``object.__eq__``.
    def __eq__(self, other: object) -> bool: ...
    # Read-only string form of this missing value.
    @property
    def string(self) -> str: ...
    # Read-only numeric form of this missing value.
    @property
    def value(self) -> float: ...
    # NOTE(review): annotations added to match the pandas implementation's
    # signature (``dtype: np.dtype`` -> ``float``) - confirm against the
    # targeted pandas version.
    @classmethod
    def get_base_missing_value(cls, dtype: np.dtype) -> float: ...
|
||
class StataParser: | ||
Dr-Irv marked this conversation as resolved.
Show resolved
Hide resolved
|
||
DTYPE_MAP: dict[int, np.dtype] = ... | ||
DTYPE_MAP_XML: dict[int, np.dtype] = ... | ||
|
@@ -160,19 +151,6 @@ class StataWriter(StataParser): | |
) -> None: ... | ||
def write_file(self) -> None: ... | ||
|
||
# Type stub for ``StataStrLWriter``: only attribute types and method
# signatures are declared here; there is no implementation in this file.
class StataStrLWriter:
    df: DataFrame = ...
    columns: Sequence[str] = ...
    # ``version`` and ``byteorder`` are optional; ``df`` and ``columns`` are
    # required.
    def __init__(
        self,
        df: DataFrame,
        columns: Sequence[str],
        version: int = ...,
        byteorder: str | None = ...,
    ) -> None: ...
    # Returns a pair: a mapping from str to an (int, int) tuple, and a DataFrame.
    def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: ...
    # Takes a mapping of the same shape as the first element returned by
    # ``generate_table`` and returns ``bytes``.
    def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: ...
|
||
class StataWriter117(StataWriter): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
def __init__( | ||
self, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from __future__ import annotations | ||
|
||
from contextlib import contextmanager | ||
from pathlib import Path | ||
import tempfile | ||
from typing import ( | ||
IO, | ||
Any, | ||
) | ||
import uuid | ||
|
||
import pandas as pd | ||
from pandas import DataFrame | ||
from typing_extensions import assert_type | ||
|
||
from tests import check | ||
|
||
from pandas.io.stata import ( | ||
StataReader, | ||
read_stata, | ||
) | ||
|
||
DF = DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]}) | ||
|
||
|
||
@contextmanager
def ensure_clean(
    filename: str | None = None, return_filelike: bool = False, **kwargs: Any
):
    """
    Get a temporary path and agree to remove it on close.

    This implementation does not use tempfile.mkstemp to avoid having a file
    handle. If the code using the returned path wants to delete the file
    itself, windows requires that no program has a file handle to it.

    Parameters
    ----------
    filename : str (optional)
        suffix of the created file.
    return_filelike : bool (default False)
        if True, returns a file-like which is *always* cleaned. Necessary for
        savefig and other functions which want to append extensions.
    **kwargs
        Additional keywords are passed to open().

    Yields
    ------
    str or file-like
        The path of the temporary file as a string, or an open file object
        when ``return_filelike`` is True (opened with mode "w+b" unless a
        ``mode`` keyword is given).
    """
    suffix = "" if filename is None else filename
    # A random UUID prefix keeps concurrently created files from colliding.
    path = Path(tempfile.gettempdir()) / (str(uuid.uuid4()) + suffix)
    path.touch()

    handle_or_str: str | IO = str(path)
    try:
        if return_filelike:
            kwargs.setdefault("mode", "w+b")
            # Opened inside the ``try`` so that a failing open() (for example
            # an invalid mode) still removes the file created by touch().
            handle_or_str = open(path, **kwargs)
        yield handle_or_str
    finally:
        # Close our own handle first: an open handle blocks deletion on Windows.
        if not isinstance(handle_or_str, str):
            handle_or_str.close()
        # The caller may already have deleted the file itself.
        if path.is_file():
            path.unlink()
|
||
|
||
def test_read_stata_df():
    """Check that ``read_stata`` with default arguments is typed as, and returns, a DataFrame."""
    with ensure_clean() as path:
        DF.to_stata(path)
        frame = read_stata(path)
        check(assert_type(frame, pd.DataFrame), pd.DataFrame)
|
||
|
||
def test_read_stata_iterator_positional():
    """Check that passing ``iterator=True`` positionally is typed as, and returns, a StataReader."""
    with ensure_clean() as path:
        str_path = str(path)
        DF.to_stata(str_path)
        reader = read_stata(
            str_path, False, False, None, False, False, None, False, 2, True
        )
        # The reader may keep the underlying file open; release it before
        # ensure_clean unlinks the path (an open handle blocks deletion on
        # Windows).
        with reader:
            check(assert_type(reader, StataReader), StataReader)
|
||
|
||
def test_read_stata_iterator():
    """Check that ``read_stata(..., iterator=True)`` is typed as, and returns, a StataReader."""
    with ensure_clean() as path:
        str_path = str(path)
        DF.to_stata(str_path)
        reader = read_stata(str_path, iterator=True)
        # The reader may keep the underlying file open; release it before
        # ensure_clean unlinks the path (an open handle blocks deletion on
        # Windows).
        with reader:
            check(assert_type(reader, StataReader), StataReader)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the code, the only valid values for `str` are ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", "%tq", "th", "%th", "ty", "%ty"].