Skip to content

ENH: Synchronize io/stata with pandas master #202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions pandas-stubs/_typing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,23 @@ GroupByObjectNonScalar = Union[
]
GroupByObject = Union[Scalar, GroupByObjectNonScalar]

# Stata date-format codes. Every code is listed twice: once bare ("tc") and
# once with a leading percent sign ("%tc").
StataDateFormat = Literal[
"tc",
"%tc",
"td",
"%td",
"tw",
"%tw",
"tm",
"%tm",
"tq",
"%tq",
"th",
"%th",
"ty",
"%ty",
]

FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
ReplaceMethod = Literal["pad", "ffill", "bfill"]
SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
Expand Down
20 changes: 14 additions & 6 deletions pandas-stubs/core/frame.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ from pandas._typing import (
CompressionOptions,
Dtype,
DtypeNp,
FilePath,
FilePathOrBuffer,
FilePathOrBytesBuffer,
FillnaOptions,
Expand All @@ -74,9 +75,12 @@ from pandas._typing import (
ScalarT,
SeriesAxisType,
SortKind,
StataDateFormat,
StorageOptions,
StrLike,
T as TType,
TimestampConvention,
WriteBuffer,
np_ndarray_bool,
np_ndarray_str,
num,
Expand Down Expand Up @@ -245,15 +249,19 @@ class DataFrame(NDFrame, OpsMixin):
) -> np.recarray: ...
def to_stata(
self,
path: FilePathOrBuffer,
convert_dates: dict | None = ...,
path: FilePath | WriteBuffer[bytes],
convert_dates: dict[HashableT, StataDateFormat] | None = ...,
write_index: _bool = ...,
byteorder: Literal["<", ">", "little", "big"] | None = ...,
time_stamp=...,
time_stamp: _dt.datetime | None = ...,
data_label: _str | None = ...,
variable_labels: dict | None = ...,
version: int = ...,
convert_strl: list[_str] | None = ...,
variable_labels: dict[HashableT, str] | None = ...,
version: Literal[114, 117, 118, 119] | None = ...,
convert_strl: list[HashableT] | None = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
*,
value_labels: dict[Hashable, dict[float, str]] | None = ...,
) -> None: ...
def to_feather(self, path: FilePathOrBuffer, **kwargs) -> None: ...
@overload
Expand Down
150 changes: 39 additions & 111 deletions pandas-stubs/io/stata.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ import datetime
from io import BytesIO
from types import TracebackType
from typing import (
Hashable,
Literal,
Sequence,
overload,
)

import numpy as np
Expand All @@ -18,10 +18,12 @@ from pandas._typing import (
FilePath,
HashableT,
ReadBuffer,
StataDateFormat,
StorageOptions,
WriteBuffer,
)

@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool = ...,
Expand All @@ -32,70 +34,47 @@ def read_stata(
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int | None = ...,
iterator: bool = ...,
*,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the second and third overloads differ only in whether the arguments are keyword-only or can also be provided as positional arguments?

If pandas had already deprecated positional arguments, I would remove the positional overloads, but they are not deprecated yet. (I will try to open a PR for that at pandas later today.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup. They are needed to handle the cases where iterator=True is passed either as a keyword argument or positionally.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opened pandas-dev/pandas#48128 to make most arguments keyword-only

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Passing arguments positionally will now be deprecated in 1.5 (except for the first non-self argument).

iterator: Literal[True],
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> DataFrame | StataReader: ...

stata_epoch: datetime.datetime = ...
excessive_string_length_error: str
) -> StataReader: ...
# Overload: every parameter up to and including `iterator` is required here,
# so this matches calls that spell them all out (e.g. positionally) with
# iterator=True. Such calls return a StataReader.
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool,
convert_categoricals: bool,
index_col: str | None,
convert_missing: bool,
preserve_dtypes: bool,
columns: list[HashableT] | None,
order_categoricals: bool,
chunksize: int | None,
iterator: Literal[True],
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> StataReader: ...
# Overload: `iterator` is omitted or passed as False, so the result is a
# DataFrame.
@overload
def read_stata(
path: FilePath | ReadBuffer[bytes],
convert_dates: bool = ...,
convert_categoricals: bool = ...,
index_col: str | None = ...,
convert_missing: bool = ...,
preserve_dtypes: bool = ...,
columns: list[HashableT] | None = ...,
order_categoricals: bool = ...,
chunksize: int | None = ...,
iterator: Literal[False] = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
) -> DataFrame: ...

class PossiblePrecisionLoss(Warning): ...

precision_loss_doc: str

class ValueLabelTypeMismatch(Warning): ...

value_label_mismatch_doc: str

class InvalidColumnName(Warning): ...

invalid_name_doc: str

class StataValueLabel:
labname: Hashable = ...
value_labels: list[tuple[float, str]] = ...
text_len: int = ...
off: npt.NDArray[np.int32] = ...
val: npt.NDArray[np.int32] = ...
txt: list[bytes] = ...
n: int = ...
len: int = ...
def __init__(
self, catarray: pd.Series, encoding: Literal["latin-1", "utf-8"] = ...
) -> None: ...
def generate_value_label(self, byteorder: str) -> bytes: ...

class StataMissingValue:
MISSING_VALUES: dict[float, str] = ...
bases: tuple[int, int, int] = ...
float32_base: bytes = ...
increment: int = ...
int_value: int = ...
float64_base: bytes = ...
BASE_MISSING_VALUES: dict[str, int] = ...
def __init__(self, value: float) -> None: ...
def __eq__(self, other: object) -> bool: ...
@property
def string(self) -> str: ...
@property
def value(self) -> float: ...
@classmethod
def get_base_missing_value(cls, dtype): ...

class StataParser:
DTYPE_MAP: dict[int, np.dtype] = ...
DTYPE_MAP_XML: dict[int, np.dtype] = ...
TYPE_MAP: list[tuple[int | str, ...]] = ...
TYPE_MAP_XML: dict[int, str] = ...
VALID_RANGE: dict[
str,
tuple[int, int] | tuple[np.float32, np.float32] | tuple[np.float64, np.float64],
] = ...
OLD_TYPE_MAPPING: dict[int, int] = ...
MISSING_VALUES: dict[str, int] = ...
NUMPY_TYPE_MAP: dict[str, str] = ...
RESERVED_WORDS: tuple[str, ...] = ...
def __init__(self) -> None: ...

class StataReader(StataParser, abc.Iterator):
Expand Down Expand Up @@ -142,70 +121,19 @@ class StataReader(StataParser, abc.Iterator):
def value_labels(self) -> dict[str, dict[float, str]]: ...

class StataWriter(StataParser):
type_converters: dict[str, type[np.dtype]] = ...
def __init__(
self,
fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = ...,
convert_dates: dict[HashableT, StataDateFormat] | None = ...,
write_index: bool = ...,
byteorder: str | None = ...,
time_stamp: datetime.datetime | None = ...,
data_label: str | None = ...,
variable_labels: dict[Hashable, str] | None = ...,
variable_labels: dict[HashableT, str] | None = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
*,
value_labels: dict[Hashable, dict[float, str]] | None = ...,
value_labels: dict[HashableT, dict[float, str]] | None = ...,
) -> None: ...
def write_file(self) -> None: ...

class StataStrLWriter:
df: DataFrame = ...
columns: Sequence[str] = ...
def __init__(
self,
df: DataFrame,
columns: Sequence[str],
version: int = ...,
byteorder: str | None = ...,
) -> None: ...
def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: ...
def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: ...

class StataWriter117(StataWriter):
def __init__(
self,
fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = ...,
write_index: bool = ...,
byteorder: str | None = ...,
time_stamp: datetime.datetime | None = ...,
data_label: str | None = ...,
variable_labels: dict[Hashable, str] | None = ...,
convert_strl: Sequence[Hashable] | None = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
*,
value_labels: dict[Hashable, dict[float, str]] | None = ...,
) -> None: ...

class StataWriterUTF8(StataWriter117):
def __init__(
self,
fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = ...,
write_index: bool = ...,
byteorder: str | None = ...,
time_stamp: datetime.datetime | None = ...,
data_label: str | None = ...,
variable_labels: dict[Hashable, str] | None = ...,
convert_strl: Sequence[Hashable] | None = ...,
version: int | None = ...,
compression: CompressionOptions = ...,
storage_options: StorageOptions = ...,
*,
value_labels: dict[Hashable, dict[float, str]] | None = ...,
) -> None: ...
34 changes: 34 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,52 @@
import pandas as pd
from pandas import (
DataFrame,
read_clipboard,
read_stata,
)
from pandas._testing import ensure_clean
import pytest
from typing_extensions import assert_type

from tests import check

from pandas.io.clipboard import PyperclipException
from pandas.io.parsers import TextFileReader
from pandas.io.stata import StataReader

DF = DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]})


def test_read_stata_df():
    """Write DF to a temporary Stata file and check a default read_stata call against pd.DataFrame."""
    with ensure_clean() as tmp_path:
        DF.to_stata(tmp_path)
        check(
            assert_type(read_stata(tmp_path), pd.DataFrame),
            pd.DataFrame,
        )


def test_read_stata_iterator_positional():
    """Call read_stata with every option through ``iterator`` given positionally and check against StataReader."""
    with ensure_clean() as tmp_path:
        stata_file = str(tmp_path)
        DF.to_stata(stata_file)
        # Everything after the path is deliberately positional; the tenth
        # argument (True) is the one that fills the ``iterator`` parameter.
        # The reader is kept as a temporary rather than bound to a name so
        # it is not held alive past this statement.
        check(
            assert_type(
                read_stata(
                    stata_file, False, False, None, False, False, None, False, 2, True
                ),
                StataReader,
            ),
            StataReader,
        )


def test_read_stata_iterator():
    """Call read_stata with ``iterator=True`` as a keyword and check against StataReader."""
    with ensure_clean() as tmp_path:
        stata_file = str(tmp_path)
        DF.to_stata(stata_file)
        # The reader is kept as a temporary rather than bound to a name so
        # it is not held alive past this statement.
        check(
            assert_type(
                read_stata(stata_file, iterator=True),
                StataReader,
            ),
            StataReader,
        )


def test_clipboard():
try:
DF.to_clipboard()
Expand Down