Merge remote-tracking branch 'upstream/master' into series_rolling_count_ignores_min_periods

fujiaxiang · fujiaxiang · commit b0f5baa440e9 · 2020-01-19T11:13:08.000+08:00
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -0,0 +1,28 @@
+// For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at
+// https://github.com/microsoft/vscode-dev-containers/tree/master/containers/python-3-miniconda
+{
+	"name": "pandas",
+	"context": ".",
+	"dockerFile": "Dockerfile",
+
+	// Use 'settings' to set *default* container specific settings.json values on container create.
+	// You can edit these settings after create using File > Preferences > Settings > Remote.
+	"settings": {
+		"terminal.integrated.shell.linux": "/bin/bash",
+		"python.condaPath": "/opt/conda/bin/conda",
+		"python.pythonPath": "/opt/conda/bin/python",
+		"python.formatting.provider": "black",
+		"python.linting.enabled": true,
+		"python.linting.flake8Enabled": true,
+		"python.linting.pylintEnabled": false,
+		"python.linting.mypyEnabled": true,
+		"python.testing.pytestEnabled": true,
+		"python.testing.cwd": "pandas/tests"
+	},
+
+	// Add the IDs of extensions you want installed when the container is created in the array below.
+	"extensions": [
+		"ms-python.python",
+		"ms-vscode.cpptools"
+	]
+}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,47 @@
+FROM continuumio/miniconda3
+
+# if you forked pandas, you can pass in your own GitHub username to use your fork
+# i.e. gh_username=myname
+ARG gh_username=pandas-dev
+ARG pandas_home="/home/pandas"
+
+# Avoid warnings by switching to noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Configure apt and install packages
+RUN apt-get update \
+    && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
+    #
+    # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
+    && apt-get -y install git iproute2 procps iproute2 lsb-release \
+    #
+    # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill),
+    # needed to build pandas C extensions
+    && apt-get -y install build-essential \
+    #
+    # cleanup
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/*
+
+# Switch back to dialog for any ad-hoc use of apt-get
+ENV DEBIAN_FRONTEND=dialog
+
+# Clone pandas repo
+RUN mkdir "$pandas_home" \
+    && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \
+    && cd "$pandas_home" \
+    && git remote add upstream "https://github.com/pandas-dev/pandas.git" \
+    && git pull upstream master
+
+# Because it is surprisingly difficult to activate a conda environment inside a Dockerfile
+# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89),
+# we just update the base/root one from the 'environment.yml' file instead of creating a new one.
+#
+# Set up environment
+RUN conda env update -n base -f "$pandas_home/environment.yml"
+
+# Build C extensions and pandas
+RUN cd "$pandas_home" \
+    && python setup.py build_ext --inplace -j 4 \
+    && python -m pip install -e .
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
@@ -146,6 +146,17 @@ requires a C compiler and Python environment. If you're making documentation
 changes, you can skip to :ref:`contributing.documentation` but you won't be able
 to build the documentation locally before pushing your changes.
 
+Using a Docker Container
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Instead of manually setting up a development environment, you can use Docker to
+automatically create the environment with just a few commands. pandas provides a ``Dockerfile``
+in the root directory to build a Docker image with a full pandas development environment.
+
+Even easier, you can use the ``Dockerfile`` to launch a remote session with Visual Studio Code,
+a popular free IDE, using the ``.devcontainer.json`` file.
+See https://code.visualstudio.com/docs/remote/containers for details.
+
 .. _contributing.dev_c:
 
 Installing a C compiler
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
@@ -122,16 +122,14 @@ also goes beyond matplotlib and pandas with the option to perform statistical
 estimation while plotting, aggregating across observations and visualizing the
 fit of statistical models to emphasize patterns in a dataset.
 
-`yhat/ggpy <https://github.com/yhat/ggpy>`__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`plotnine <https://github.com/has2k1/plotnine/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Hadley Wickham's `ggplot2 <https://ggplot2.tidyverse.org/>`__ is a foundational exploratory visualization package for the R language.
 Based on `"The Grammar of Graphics" <https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html>`__ it
 provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data.
-It's really quite incredible. Various implementations to other languages are available,
-but a faithful implementation for Python users has long been missing. Although still young
-(as of Jan-2014), the `yhat/ggpy <https://github.com/yhat/ggpy>`__ project has been
-progressing quickly in that direction.
+Implementations in various other languages are available.
+A good implementation for Python users is `has2k1/plotnine <https://github.com/has2k1/plotnine/>`__.
 
 `IPython Vega <https://github.com/vega/ipyvega>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4639,7 +4639,8 @@ def get_value(self, series, key):
 
         k = self._convert_scalar_indexer(key, kind="getitem")
         try:
-            return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
+            loc = self._engine.get_loc(k)
+
         except KeyError as e1:
             if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
                 raise
@@ -4648,19 +4649,17 @@ def get_value(self, series, key):
                 return libindex.get_value_at(s, key)
             except IndexError:
                 raise
-            except TypeError:
-                # generator/iterator-like
-                if is_iterator(key):
-                    raise InvalidIndexError(key)
-                else:
-                    raise e1
             except Exception:
                 raise e1
         except TypeError:
             # e.g. "[False] is an invalid key"
-            if is_scalar(key):
-                raise IndexError(key)
-            raise InvalidIndexError(key)
+            raise IndexError(key)
+
+        else:
+            if is_scalar(loc):
+                tz = getattr(series.dtype, "tz", None)
+                return libindex.get_value_at(s, loc, tz=tz)
+            return series.iloc[loc]
 
     def set_value(self, arr, key, value):
         """
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -488,17 +488,18 @@ def __contains__(self, other) -> bool:
 
     @Appender(_index_shared_docs["get_loc"])
     def get_loc(self, key, method=None, tolerance=None):
-        try:
-            if np.all(np.isnan(key)) or is_bool(key):
-                nan_idxs = self._nan_idxs
-                try:
-                    return nan_idxs.item()
-                except ValueError:
-                    if not len(nan_idxs):
-                        raise KeyError(key)
-                    return nan_idxs
-        except (TypeError, NotImplementedError):
-            pass
+        if is_bool(key):
+            # Catch this to avoid accidentally casting to 1.0
+            raise KeyError(key)
+
+        if is_float(key) and np.isnan(key):
+            nan_idxs = self._nan_idxs
+            if not len(nan_idxs):
+                raise KeyError(key)
+            elif len(nan_idxs) == 1:
+                return nan_idxs[0]
+            return nan_idxs
+
         return super().get_loc(key, method=method, tolerance=tolerance)
 
     @cache_readonly
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -815,18 +815,6 @@ def __getitem__(self, key):
         try:
             result = self.index.get_value(self, key)
 
-            if not is_scalar(result):
-                if is_list_like(result) and not isinstance(result, Series):
-
-                    # we need to box if loc of the key isn't scalar here
-                    # otherwise have inline ndarray/lists
-                    try:
-                        if not is_scalar(self.index.get_loc(key)):
-                            result = self._constructor(
-                                result, index=[key] * len(result), dtype=self.dtype
-                            ).__finalize__(self)
-                    except KeyError:
-                        pass
             return result
         except InvalidIndexError:
             pass
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -591,9 +591,14 @@ def _setup_build_doc(self):
     def _build_doc(self):
         from bs4 import BeautifulSoup
 
-        return BeautifulSoup(
-            self._setup_build_doc(), features="html5lib", from_encoding=self.encoding
-        )
+        bdoc = self._setup_build_doc()
+        if isinstance(bdoc, bytes) and self.encoding is not None:
+            udoc = bdoc.decode(self.encoding)
+            from_encoding = None
+        else:
+            udoc = bdoc
+            from_encoding = self.encoding
+        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
 
 
 def _build_xpath_expr(attrs) -> str:
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -396,7 +396,8 @@ def test_get_loc_missing_nan():
         idx.get_loc(3)
     with pytest.raises(KeyError, match=r"^nan$"):
         idx.get_loc(np.nan)
-    with pytest.raises(KeyError, match=r"^\[nan\]$"):
+    with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"):
+        # listlike/non-hashable raises TypeError
         idx.get_loc([np.nan])
 
 
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -389,7 +389,8 @@ def test_get_loc_missing_nan(self):
             idx.get_loc(3)
         with pytest.raises(KeyError, match="^nan$"):
             idx.get_loc(np.nan)
-        with pytest.raises(KeyError, match=r"^\[nan\]$"):
+        with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"):
+            # listlike/non-hashable raises TypeError
             idx.get_loc([np.nan])
 
     def test_contains_nans(self):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1158,9 +1158,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
             assert len(dfs) == 1  # Should not parse hidden table
 
     def test_encode(self, html_encoding_file):
-        _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split(
-            "_"
-        )
+        base_path = os.path.basename(html_encoding_file)
+        root = os.path.splitext(base_path)[0]
+        _, encoding = root.split("_")
 
         try:
             with open(html_encoding_file, "rb") as fobj:
@@ -1183,7 +1183,7 @@ def test_encode(self, html_encoding_file):
             if is_platform_windows():
                 if "16" in encoding or "32" in encoding:
                     pytest.skip()
-                raise
+            raise
 
     def test_parse_failure_unseekable(self):
         # Issue #17975
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
@@ -84,19 +84,16 @@ pandas with the option to perform statistical estimation while plotting,
 aggregating across observations and visualizing the fit of statistical
 models to emphasize patterns in a dataset.
 
-### [yhat/ggpy](https://github.com/yhat/ggpy)
+### [plotnine](https://github.com/has2k1/plotnine/)
 
 Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a
 foundational exploratory visualization package for the R language. Based
 on ["The Grammar of
 Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html)
 it provides a powerful, declarative and extremely general way to
-generate bespoke plots of any kind of data. It's really quite
-incredible. Various implementations to other languages are available,
-but a faithful implementation for Python users has long been missing.
-Although still young (as of Jan-2014), the
-[yhat/ggpy](https://github.com/yhat/ggpy) project has been progressing
-quickly in that direction.
+generate bespoke plots of any kind of data.
+Implementations in various other languages are available.
+A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/).
 
 ### [IPython Vega](https://github.com/vega/ipyvega)