TYPING: some type hints for pandas\io\common.py #27598

Merged
merged 15 commits on Aug 2, 2019
Changes from 4 commits
63 changes: 40 additions & 23 deletions pandas/io/common.py
@@ -10,6 +10,7 @@
import mmap
import os
import pathlib
from typing import IO, BinaryIO, Optional, TextIO, Tuple, Type
from urllib.error import URLError # noqa
from urllib.parse import ( # noqa
urlencode,
@@ -32,6 +33,8 @@

from pandas.core.dtypes.common import is_file_like

from pandas._typing import FilePathOrBuffer

# gh-12665: Alias for now and remove later.
CParserError = ParserError
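
For context, `FilePathOrBuffer` is the union alias from `pandas._typing` that most of the new hints use; at the time of this PR it is roughly the following (abridged sketch, not part of this diff):

```python
# pandas/_typing.py (abridged) -- the alias the new annotations rely on
from pathlib import Path
from typing import IO, AnyStr, Union

FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
```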

@@ -68,14 +71,14 @@ class BaseIterator:
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""

def __iter__(self):
def __iter__(self) -> "BaseIterator":
return self

def __next__(self):
raise AbstractMethodError(self)


def _is_url(url):
def _is_url(url) -> bool:
"""Check to see if a URL has a valid protocol.

Parameters
@@ -93,7 +96,7 @@ def _is_url(url):
return False


def _expand_user(filepath_or_buffer):
def _expand_user(filepath_or_buffer: FilePathOrBuffer) -> FilePathOrBuffer:
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.

@@ -111,7 +114,7 @@ def _expand_user(filepath_or_buffer):
return filepath_or_buffer


def _validate_header_arg(header):
def _validate_header_arg(header) -> None:
if isinstance(header, bool):
raise TypeError(
"Passing a bool to header is invalid. "
@@ -121,7 +124,7 @@ def _validate_header_arg(header):
)


def _stringify_path(filepath_or_buffer):
def _stringify_path(filepath_or_buffer: FilePathOrBuffer) -> FilePathOrBuffer:
"""Attempt to convert a path-like object to a string.

Parameters
@@ -144,21 +147,22 @@ def _stringify_path(filepath_or_buffer):
strings, buffers, or anything else that's not even path-like.
"""
if hasattr(filepath_or_buffer, "__fspath__"):
return filepath_or_buffer.__fspath__()
# https://github.com/python/mypy/issues/1424
Member
When we drop 3.5 support do you think we can just do `isinstance(filepath_or_buffer, os.PathLike)` here instead?

return filepath_or_buffer.__fspath__() # type: ignore
elif isinstance(filepath_or_buffer, pathlib.Path):
return str(filepath_or_buffer)
return _expand_user(filepath_or_buffer)
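
Regarding the review question above about `os.PathLike`: a minimal sketch of what `_stringify_path` could look like once Python 3.5 support is dropped (illustrative only, not part of this diff):

```python
import os

def _stringify_path(filepath_or_buffer):
    # os.PathLike / os.fspath (Python 3.6+) cover both __fspath__ objects
    # and pathlib.Path, removing the need for the mypy ignore above
    if isinstance(filepath_or_buffer, os.PathLike):
        return os.fspath(filepath_or_buffer)
    return _expand_user(filepath_or_buffer)
```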


def is_s3_url(url):
def is_s3_url(url) -> bool:
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ["s3", "s3n", "s3a"]
except Exception:
return False


def is_gcs_url(url):
def is_gcs_url(url) -> bool:
"""Check for a gcs url"""
try:
return parse_url(url).scheme in ["gcs", "gs"]
@@ -167,8 +171,11 @@ def is_gcs_url(url):


def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
):
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
compression: Optional[str] = None,
mode: Optional[str] = None,
) -> Tuple[FilePathOrBuffer, Optional[str], Optional[str], bool]:
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
@@ -190,7 +197,7 @@ def get_filepath_or_buffer(
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)

if _is_url(filepath_or_buffer):
if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
@@ -224,7 +231,7 @@ def get_filepath_or_buffer(
return filepath_or_buffer, None, compression, False
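
For reference, a hedged sketch of how a caller consumes the annotated 4-tuple; the pattern mirrors the `should_close` handling in pandas/io/parsers.py further down (the file name below is hypothetical):

```python
from pandas.io.common import get_filepath_or_buffer

# unpack the Tuple[FilePathOrBuffer, Optional[str], Optional[str], bool]
fp_or_buf, encoding, compression, should_close = get_filepath_or_buffer(
    "data.csv"  # hypothetical local path: returned unchanged, should_close=False
)
try:
    pass  # hand fp_or_buf to a reader/parser here
finally:
    if should_close:
        fp_or_buf.close()  # only close handles the function opened itself
```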


def file_path_to_url(path):
def file_path_to_url(path: str) -> str:
"""
converts an absolute native path to a FILE URL.

@@ -242,7 +249,9 @@ def file_path_to_url(path):
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}


def _infer_compression(filepath_or_buffer, compression):
def _infer_compression(
filepath_or_buffer: FilePathOrBuffer, compression: Optional[str]
) -> Optional[str]:
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
@@ -435,7 +444,13 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore
"""

# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def __init__(
self,
file: FilePathOrBuffer,
mode: str,
compression: int = zipfile.ZIP_DEFLATED,
**kwargs
):
if mode in ["wb", "rb"]:
mode = mode.replace("b", "")
super().__init__(file, mode, compression, **kwargs)
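
For context on the mode munging just above: `zipfile.ZipFile` only accepts text modes ("r", "w", "x", "a"), which is why the binary flag is stripped, and `zipfile.ZIP_DEFLATED` is a plain int constant, hence `compression: int`. A minimal standalone sketch:

```python
import io
import zipfile

buf = io.BytesIO()
# ZipFile rejects "wb", so BytesZipFile maps "wb" -> "w" before delegating
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.writestr("data.csv", "a,b\n1,2\n")
print(buf.getvalue()[:2])  # b'PK' -- a valid zip archive header
```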
@@ -461,16 +476,16 @@ class MMapWrapper(BaseIterator):

"""

def __init__(self, f):
def __init__(self, f: IO):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

def __getattr__(self, name):
def __getattr__(self, name: str):
return getattr(self.mmap, name)

def __iter__(self):
def __iter__(self) -> "MMapWrapper":
return self

def __next__(self):
def __next__(self) -> str:
newline = self.mmap.readline()

# readline returns bytes, not str, but Python's CSV reader
@@ -491,16 +506,16 @@ class UTF8Recoder(BaseIterator):
Iterator that reads an encoded stream and re-encodes the input to UTF-8
"""

def __init__(self, f, encoding):
def __init__(self, f: BinaryIO, encoding: str):
self.reader = codecs.getreader(encoding)(f)

def read(self, bytes=-1):
def read(self, bytes: int = -1) -> bytes:
return self.reader.read(bytes).encode("utf-8")

def readline(self):
def readline(self) -> bytes:
return self.reader.readline().encode("utf-8")

def next(self):
def next(self) -> bytes:
return next(self.reader).encode("utf-8")
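
A small usage sketch of `UTF8Recoder` (assuming the class as defined above; the latin-1 payload is purely illustrative):

```python
import io

from pandas.io.common import UTF8Recoder

raw = io.BytesIO("café\n".encode("latin-1"))   # non-UTF-8 byte stream
recoder = UTF8Recoder(raw, encoding="latin-1")
print(recoder.readline())                      # b'caf\xc3\xa9\n' (UTF-8 bytes)
```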


@@ -511,5 +526,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.reader(f, dialect=dialect, **kwds)


def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
def UnicodeWriter(
f: TextIO, dialect: Type[csv.Dialect] = csv.excel, encoding: str = "utf-8", **kwds
):
return csv.writer(f, dialect=dialect, **kwds)
9 changes: 9 additions & 0 deletions pandas/io/formats/format.py
@@ -8,6 +8,7 @@
import re
from shutil import get_terminal_size
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
@@ -730,6 +731,11 @@ def to_string(self) -> None:
"""
Render a DataFrame to a console-friendly tabular output.
"""
# Note: the to_string method only accepts IO whereas to_html and

Member
Thanks for the notes - are these bugs?

Member Author
These are the sort of inconsistencies between to_html, to_string and to_latex that I was hoping to find by adding type hints.

This should help with refactoring the three methods to share buffer handling code.

Can't really call them bugs when the documentation doesn't say that filenames are accepted (even though they are for to_html and to_latex):

    buf : StringIO-like, optional
        Buffer to write to.

Member
Do you know what the lifecycle of self.buf = cast(...) is? Would that cast still just stay local to the function?

# to_latex accept FilePathOrBuffer, will raise
# AttributeError: 'str' object has no attribute 'writelines'
self.buf = cast(IO, self.buf)

from pandas import Series

frame = self.frame
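
On the cast lifecycle question in the thread above: `typing.cast` is purely a static-typing construct; at runtime it returns its argument unchanged, so the narrowed type is only visible to mypy from that assignment onward inside the method. A minimal sketch:

```python
import io
from typing import IO, cast

buf = io.StringIO()
narrowed = cast(IO, buf)   # runtime no-op: cast just returns its argument
assert narrowed is buf     # same object; only the static type changes
```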
@@ -902,6 +908,9 @@ def to_html(
Klass = NotebookFormatter if notebook else HTMLFormatter
html = Klass(self, classes=classes, border=border).render()
if hasattr(self.buf, "write"):
# Note: only TextIO is supported, a BytesIO object will raise
# TypeError: a bytes-like object is required, not 'str'
self.buf = cast(TextIO, self.buf)
buffer_put_lines(self.buf, html)
elif isinstance(self.buf, str):
with open(self.buf, "w") as f:
4 changes: 3 additions & 1 deletion pandas/io/parsers.py
@@ -460,7 +460,9 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):

if should_close:
try:
fp_or_buf.close()
# error: Item "str" of "Union[str, Path, IO[Any]]" has no attribute "close"
# error: Item "Path" of "Union[str, Path, IO[Any]]" has no attribute "close"
fp_or_buf.close() # type: ignore
except ValueError:
pass
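
An alternative some codebases use instead of the `# type: ignore` above is to narrow the union before calling `close()`; a hedged sketch with a hypothetical helper (not what this PR does):

```python
import pathlib
from typing import IO, Union


def _maybe_close(fp_or_buf: Union[str, pathlib.Path, IO], should_close: bool) -> None:
    # isinstance narrowing lets mypy prove only the IO branch reaches .close()
    if should_close and not isinstance(fp_or_buf, (str, pathlib.Path)):
        try:
            fp_or_buf.close()
        except ValueError:
            pass
```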
