
ENH: Synchronize io/stata with pandas master #202


Merged: 6 commits, Aug 22, 2022
22 changes: 15 additions & 7 deletions pandas-stubs/core/frame.pyi
@@ -48,8 +48,10 @@ from pandas._typing import (
Axes,
Axis,
AxisType,
CompressionOptions,
Dtype,
DtypeNp,
FilePath,
FilePathOrBuffer,
FilePathOrBytesBuffer,
GroupByObjectNonScalar,
@@ -67,8 +69,10 @@ from pandas._typing import (
Scalar,
ScalarT,
SeriesAxisType,
StorageOptions,
StrLike,
T as TType,
WriteBuffer,
np_ndarray_bool,
np_ndarray_str,
num,
@@ -236,15 +240,19 @@ class DataFrame(NDFrame, OpsMixin):
) -> np.recarray: ...
def to_stata(
self,
path: FilePathOrBuffer,
convert_dates: dict | None = ...,
path: FilePath | WriteBuffer[bytes],
convert_dates: dict[Hashable, str] | None = ...,

Collaborator:

Looking at the code, the only valid str values for convert_dates are "tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", "%tq", "th", "%th", "ty", and "%ty".

write_index: _bool = ...,
byteorder: _str | Literal["<", ">", "little", "big"] | None = ...,
time_stamp=...,
byteorder: Literal["<", ">", "little", "big"] | None = ...,
time_stamp: _dt.datetime | None = ...,
data_label: _str | None = ...,
variable_labels: dict | None = ...,
version: int = ...,
convert_strl: list[_str] | None = ...,
variable_labels: dict[Hashable, str] | None = ...,
version: int | None = ...,
convert_strl: list[HashableT] | None = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
*,
value_labels: dict[Hashable, dict[float, str]] | None = ...,
) -> None: ...
def to_feather(self, path: FilePathOrBuffer, **kwargs) -> None: ...
@overload
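
For reference, a minimal sketch of a call that the updated to_stata stub is meant to accept, using one of the date codes listed in the collaborator comment above (the file name and column names are made up for illustration):

import datetime as dt

import pandas as pd

df = pd.DataFrame({"obs_date": pd.to_datetime(["2022-08-22"]), "value": [1.0]})
df.to_stata(
    "example.dta",                        # FilePath | WriteBuffer[bytes]
    convert_dates={"obs_date": "%td"},    # dict[Hashable, str]; one of the codes above
    time_stamp=dt.datetime(2022, 8, 22),  # now typed as dt.datetime | None
    value_labels=None,                    # keyword-only in the new signature
)
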
92 changes: 35 additions & 57 deletions pandas-stubs/io/stata.pyi
@@ -6,6 +6,7 @@ from typing import (
Hashable,
Literal,
Sequence,
overload,
)

import numpy as np
@@ -22,6 +23,7 @@ from pandas._typing import (
WriteBuffer,
)

@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool = ...,
@@ -32,57 +34,46 @@ def read_stata(
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int | None = ...,
iterator: bool = ...,
*,

Member:

so the second and third overloads differ only in whether the arguments are keyword-only or can also be provided as positional arguments?

If pandas had already deprecated positional arguments, I would remove the positional overloads, but they are not yet deprecated. (Will try to open a PR for that at pandas later today.)

Contributor Author:

Yup. They are needed to handle the case where iterator=True is passed either as a keyword argument or positionally.

Member:

I opened pandas-dev/pandas#48128 to make most arguments keyword-only.

Member:

Passing arguments positionally will now be deprecated in 1.5 (except for the first non-self argument).

iterator: Literal[True],
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> DataFrame | StataReader: ...

stata_epoch: datetime.datetime = ...
excessive_string_length_error: str
) -> StataReader: ...
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool,
convert_categoricals: bool,
index_col: str | None,
convert_missing: bool,
preserve_dtypes: bool,
columns: list[HashableT] | None,
order_categoricals: bool,
chunksize: int | None,
iterator: Literal[True],
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> StataReader: ...
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool = ...,
convert_categoricals: bool = ...,
index_col: str | None = ...,
convert_missing: bool = ...,
preserve_dtypes: bool = ...,
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int | None = ...,
iterator: Literal[False] = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> DataFrame: ...

class PossiblePrecisionLoss(Warning): ...

precision_loss_doc: str

class ValueLabelTypeMismatch(Warning): ...

value_label_mismatch_doc: str

class InvalidColumnName(Warning): ...

invalid_name_doc: str

class StataValueLabel:
labname: Hashable = ...
value_labels: list[tuple[float, str]] = ...
text_len: int = ...
off: npt.NDArray[np.int32] = ...
val: npt.NDArray[np.int32] = ...
txt: list[bytes] = ...
n: int = ...
len: int = ...
def __init__(
self, catarray: pd.Series, encoding: Literal["latin-1", "utf-8"] = ...
) -> None: ...
def generate_value_label(self, byteorder: str) -> bytes: ...

class StataMissingValue:
MISSING_VALUES: dict[float, str] = ...
bases: tuple[int, int, int] = ...
float32_base: bytes = ...
increment: int = ...
int_value: int = ...
float64_base: bytes = ...
BASE_MISSING_VALUES: dict[str, int] = ...
def __init__(self, value: float) -> None: ...
def __eq__(self, other: object) -> bool: ...
@property
def string(self) -> str: ...
@property
def value(self) -> float: ...
@classmethod
def get_base_missing_value(cls, dtype): ...

class StataParser:
DTYPE_MAP: dict[int, np.dtype] = ...
DTYPE_MAP_XML: dict[int, np.dtype] = ...
@@ -160,19 +151,6 @@ class StataWriter(StataParser):
) -> None: ...
def write_file(self) -> None: ...

class StataStrLWriter:
df: DataFrame = ...
columns: Sequence[str] = ...
def __init__(
self,
df: DataFrame,
columns: Sequence[str],
version: int = ...,
byteorder: str | None = ...,
) -> None: ...
def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: ...
def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: ...

class StataWriter117(StataWriter):

Collaborator:

StataWriter117 is not public, so we can delete it.

def __init__(
self,
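
To illustrate the overload behaviour discussed in the review thread above, a rough sketch of how the three read_stata overloads are expected to resolve (the file name is hypothetical):

from pandas.io.stata import StataReader, read_stata

frame = read_stata("example.dta")                  # iterator defaults to False -> DataFrame
reader = read_stata("example.dta", iterator=True)  # keyword-only overload -> StataReader
# fully positional call matching the positional overload (iterator passed last)
reader_pos = read_stata(
    "example.dta", False, False, None, False, False, None, False, 2, True
)
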
93 changes: 93 additions & 0 deletions tests/test_io.py
@@ -0,0 +1,93 @@
from __future__ import annotations

from contextlib import contextmanager
from pathlib import Path
import tempfile
from typing import (
    IO,
    Any,
)
import uuid

import pandas as pd
from pandas import DataFrame
from typing_extensions import assert_type

from tests import check

from pandas.io.stata import (
    StataReader,
    read_stata,
)

DF = DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]})


@contextmanager
def ensure_clean(filename=None, return_filelike: bool = False, **kwargs: Any):
"""
Gets a temporary path and agrees to remove on close.
This implementation does not use tempfile.mkstemp to avoid having a file handle.
If the code using the returned path wants to delete the file itself, windows
requires that no program has a file handle to it.
Parameters
----------
filename : str (optional)
suffix of the created file.
return_filelike : bool (default False)
if True, returns a file-like which is *always* cleaned. Necessary for
savefig and other functions which want to append extensions.
**kwargs
Additional keywords are passed to open().
"""
folder = Path(tempfile.gettempdir())

if filename is None:
filename = ""
filename = str(uuid.uuid4()) + filename
path = folder / filename

path.touch()

handle_or_str: str | IO = str(path)
if return_filelike:
kwargs.setdefault("mode", "w+b")
handle_or_str = open(path, **kwargs)

try:
yield handle_or_str
finally:
if not isinstance(handle_or_str, str):
handle_or_str.close()
if path.is_file():
path.unlink()


def test_read_stata_df():
    with ensure_clean() as path:
        DF.to_stata(path)
        check(assert_type(read_stata(path), pd.DataFrame), pd.DataFrame)


def test_read_stata_iterator_positional():
    with ensure_clean() as path:
        str_path = str(path)
        DF.to_stata(str_path)
        check(
            assert_type(
                read_stata(
                    str_path, False, False, None, False, False, None, False, 2, True
                ),
                StataReader,
            ),
            StataReader,
        )


def test_read_stata_iterator():
    with ensure_clean() as path:
        str_path = str(path)
        DF.to_stata(str_path)
        check(
            assert_type(read_stata(str_path, iterator=True), StataReader), StataReader
        )
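

Not part of this PR, but a possible follow-up check could exercise the reader returned when iterator=True; this is only a sketch, assuming the usual StataReader runtime API (read() and close()):

def test_read_stata_reader_read() -> None:
    # hypothetical extra test: consume the StataReader returned by iterator=True
    with ensure_clean() as path:
        str_path = str(path)
        DF.to_stata(str_path)
        reader = read_stata(str_path, iterator=True)
        check(reader.read(2), pd.DataFrame)
        reader.close()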