Skip to content

TYPING: some type hints for pandas\io\common.py #27598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Aug 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 4 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,6 @@ def to_string(

formatter = fmt.DataFrameFormatter(
self,
buf=buf,
columns=columns,
col_space=col_space,
na_rep=na_rep,
Expand All @@ -750,11 +749,7 @@ def to_string(
decimal=decimal,
line_width=line_width,
)
formatter.to_string()

if buf is None:
result = formatter.buf.getvalue()
return result
return formatter.to_string(buf=buf)

# ----------------------------------------------------------------------

Expand Down Expand Up @@ -2273,7 +2268,6 @@ def to_html(

formatter = fmt.DataFrameFormatter(
self,
buf=buf,
columns=columns,
col_space=col_space,
na_rep=na_rep,
Expand All @@ -2294,10 +2288,9 @@ def to_html(
render_links=render_links,
)
# TODO: a generic formatter wld b in DataFrameFormatter
formatter.to_html(classes=classes, notebook=notebook, border=border)

if buf is None:
return formatter.buf.getvalue()
return formatter.to_html(
buf=buf, classes=classes, notebook=notebook, border=border
)

# ----------------------------------------------------------------------

Expand Down
7 changes: 2 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3018,7 +3018,6 @@ def to_latex(

formatter = DataFrameFormatter(
self,
buf=buf,
columns=columns,
col_space=col_space,
na_rep=na_rep,
Expand All @@ -3032,7 +3031,8 @@ def to_latex(
escape=escape,
decimal=decimal,
)
formatter.to_latex(
return formatter.to_latex(
buf=buf,
column_format=column_format,
longtable=longtable,
encoding=encoding,
Expand All @@ -3041,9 +3041,6 @@ def to_latex(
multirow=multirow,
)

if buf is None:
return formatter.buf.getvalue()

def to_csv(
self,
path_or_buf=None,
Expand Down
65 changes: 43 additions & 22 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import mmap
import os
import pathlib
from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type
from urllib.error import URLError # noqa
from urllib.parse import ( # noqa
urlencode,
Expand All @@ -32,6 +33,8 @@

from pandas.core.dtypes.common import is_file_like

from pandas._typing import FilePathOrBuffer

# gh-12665: Alias for now and remove later.
CParserError = ParserError

Expand Down Expand Up @@ -68,14 +71,14 @@ class BaseIterator:
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""

def __iter__(self):
def __iter__(self) -> "BaseIterator":
return self

def __next__(self):
raise AbstractMethodError(self)


def _is_url(url):
def _is_url(url) -> bool:
"""Check to see if a URL has a valid protocol.

Parameters
Expand All @@ -93,7 +96,9 @@ def _is_url(url):
return False


def _expand_user(filepath_or_buffer):
def _expand_user(
filepath_or_buffer: FilePathOrBuffer[AnyStr]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does subscripting with AnyStr do here? Add to Union?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FilePathOrBuffer is just a Union. FilePathOrBuffer[AnyStr], with the subscription, effectively parametrises the TypeVar in IO, so IO[str] can't become IO[bytes].

I think https://mypy.readthedocs.io/en/latest/generics.html#generic-type-aliases explains it. I will send a different link if I come across a better explanation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's pretty cool - thanks for sharing! Does this really change anything though? We already have IO[AnyStr] in FilePathOrBuffer so this just restates that (?)

Unrelated note - IO[AnyStr] might itself be wrong, as AnyStr is a TypeVar and I think we need to parametrize IO with the actual type. I find the Python docs rather confusing on that, so maybe we remove AnyStr altogether, but we can leave that as a separate exercise (unless it helps simplify the annotation here).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated note - IO[AnyStr] might itself be wrong as AnyStr is a TypeVar and I think we need to parametrize IO with the actual type.

The alias is a Union, and the Union has one and only one Generic, so parametrising the alias is parametrising the only Generic in the Union, i.e. IO.

We already have IO[AnyStr] in FilePathOrBuffer so this just restates that (?)

I don't think AnyStr is treated as a TypeVar inside the Union when the alias is defined, so that's why it's needed at the point of use.

Does this really change anything though?

yes. mypy will fail without it.

In to_html etc. only string buffers are supported. Hence Optional[FilePathOrBuffer[str]] is used (note the parametrisation of FilePathOrBuffer here), and the TypeVar then becomes necessary; otherwise a bytes buffer could be returned.

buffer_put_lines(buf: IO[str], lines: List[str]) -> None only supports string buffers, so mypy will raise an error if we don't use TypeVars to maintain the FilePathOrBuffer type in and out of the common functions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right I think the std documentation isn't very clear but just defining IO creates a generic accepting a type of AnyStr which is str / bytes. This is in contrast to other generics that really accept type T (essentially anything). You can see this if you try to inject a non-str or bytes type

from typing import IO

foo: IO[int]

yields

error: Value of type variable "AnyStr" of "IO" cannot be "int"

So I think it is an error to keep re-parametrizing IO with AnyStr in the _typing module and here.

Let's leave to a follow up to clean up though

) -> FilePathOrBuffer[AnyStr]:
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.

Expand All @@ -111,7 +116,7 @@ def _expand_user(filepath_or_buffer):
return filepath_or_buffer


def _validate_header_arg(header):
def _validate_header_arg(header) -> None:
if isinstance(header, bool):
raise TypeError(
"Passing a bool to header is invalid. "
Expand All @@ -121,7 +126,9 @@ def _validate_header_arg(header):
)


def _stringify_path(filepath_or_buffer):
def _stringify_path(
filepath_or_buffer: FilePathOrBuffer[AnyStr]
) -> FilePathOrBuffer[AnyStr]:
"""Attempt to convert a path-like object to a string.

Parameters
Expand All @@ -144,21 +151,22 @@ def _stringify_path(filepath_or_buffer):
strings, buffers, or anything else that's not even path-like.
"""
if hasattr(filepath_or_buffer, "__fspath__"):
return filepath_or_buffer.__fspath__()
# https://github.com/python/mypy/issues/1424
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we drop 3.5 support, do you think we can just do `isinstance(filepath_or_buffer, os.PathLike)` here instead?

return filepath_or_buffer.__fspath__() # type: ignore
elif isinstance(filepath_or_buffer, pathlib.Path):
return str(filepath_or_buffer)
return _expand_user(filepath_or_buffer)


def is_s3_url(url):
def is_s3_url(url) -> bool:
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ["s3", "s3n", "s3a"]
except Exception:
return False


def is_gcs_url(url):
def is_gcs_url(url) -> bool:
"""Check for a gcs url"""
try:
return parse_url(url).scheme in ["gcs", "gs"]
Expand All @@ -167,7 +175,10 @@ def is_gcs_url(url):


def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
compression: Optional[str] = None,
mode: Optional[str] = None,
):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Expand All @@ -190,7 +201,7 @@ def get_filepath_or_buffer(
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)

if _is_url(filepath_or_buffer):
if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
Expand Down Expand Up @@ -224,7 +235,7 @@ def get_filepath_or_buffer(
return filepath_or_buffer, None, compression, False


def file_path_to_url(path):
def file_path_to_url(path: str) -> str:
"""
converts an absolute native path to a FILE URL.

Expand All @@ -242,7 +253,9 @@ def file_path_to_url(path):
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}


def _infer_compression(filepath_or_buffer, compression):
def _infer_compression(
filepath_or_buffer: FilePathOrBuffer, compression: Optional[str]
) -> Optional[str]:
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
Expand Down Expand Up @@ -435,7 +448,13 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore
"""

# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def __init__(
self,
file: FilePathOrBuffer,
mode: str,
compression: int = zipfile.ZIP_DEFLATED,
**kwargs
):
if mode in ["wb", "rb"]:
mode = mode.replace("b", "")
super().__init__(file, mode, compression, **kwargs)
Expand All @@ -461,16 +480,16 @@ class MMapWrapper(BaseIterator):

"""

def __init__(self, f):
def __init__(self, f: IO):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

def __getattr__(self, name):
def __getattr__(self, name: str):
return getattr(self.mmap, name)

def __iter__(self):
def __iter__(self) -> "MMapWrapper":
return self

def __next__(self):
def __next__(self) -> str:
newline = self.mmap.readline()

# readline returns bytes, not str, but Python's CSV reader
Expand All @@ -491,16 +510,16 @@ class UTF8Recoder(BaseIterator):
Iterator that reads an encoded stream and re-encodes the input to UTF-8
"""

def __init__(self, f, encoding):
def __init__(self, f: BinaryIO, encoding: str):
self.reader = codecs.getreader(encoding)(f)

def read(self, bytes=-1):
def read(self, bytes: int = -1) -> bytes:
return self.reader.read(bytes).encode("utf-8")

def readline(self):
def readline(self) -> bytes:
return self.reader.readline().encode("utf-8")

def next(self):
def next(self) -> bytes:
return next(self.reader).encode("utf-8")


Expand All @@ -511,5 +530,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.reader(f, dialect=dialect, **kwds)


def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
def UnicodeWriter(
f: TextIO, dialect: Type[csv.Dialect] = csv.excel, encoding: str = "utf-8", **kwds
):
return csv.writer(f, dialect=dialect, **kwds)
Loading