Skip to content

Commit 8438fe7

Browse files
authored
CI: Make is_ci_environment less necessary (#56058)
* CI: Make is_ci_environment less necessary * Add back env setting * Add back comment * Refactor test_read_csv_chunked_download
1 parent 91af4fa commit 8438fe7

File tree

7 files changed

+40
-109
lines changed

7 files changed

+40
-109
lines changed

asv_bench/benchmarks/io/csv.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,4 +621,15 @@ def time_read_csv_index_col(self):
621621
)
622622

623623

624+
class ReadCSVCParserLowMemory:
625+
# GH 16798
626+
def setup(self):
627+
self.csv = StringIO(
628+
"strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])
629+
)
630+
631+
def peakmem_over_2gb_input(self):
632+
read_csv(self.csv, engine="c", low_memory=False)
633+
634+
624635
from ..pandas_vb_common import setup # noqa: F401 isort:skip

pandas/tests/io/conftest.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,7 @@ def xml_file(datapath):
5151

5252

5353
@pytest.fixture
54-
def s3so(worker_id):
55-
if is_ci_environment():
56-
url = "http://localhost:5000/"
57-
else:
58-
worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
59-
url = f"http://127.0.0.1:555{worker_id}/"
60-
return {"client_kwargs": {"endpoint_url": url}}
61-
62-
63-
@pytest.fixture(scope="function" if is_ci_environment() else "session")
64-
def monkeysession():
65-
with pytest.MonkeyPatch.context() as mp:
66-
yield mp
67-
68-
69-
@pytest.fixture(scope="function" if is_ci_environment() else "session")
70-
def s3_base(worker_id, monkeysession):
54+
def s3_base(worker_id, monkeypatch):
7155
"""
7256
Fixture for mocking S3 interaction.
7357
@@ -79,8 +63,8 @@ def s3_base(worker_id, monkeysession):
7963

8064
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
8165
# see https://github.com/spulec/moto/issues/1924 & 1952
82-
monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
83-
monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
66+
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
67+
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
8468
if is_ci_environment():
8569
if is_platform_arm() or is_platform_mac() or is_platform_windows():
8670
# NOT RUN on Windows/macOS/ARM, only Ubuntu
@@ -93,6 +77,7 @@ def s3_base(worker_id, monkeysession):
9377
"Windows, macOS or ARM platforms"
9478
)
9579
else:
80+
# set in .github/workflows/unit-tests.yml
9681
yield "http://localhost:5000"
9782
else:
9883
requests = pytest.importorskip("requests")
@@ -128,6 +113,11 @@ def s3_base(worker_id, monkeysession):
128113
proc.terminate()
129114

130115

116+
@pytest.fixture
117+
def s3so(s3_base):
118+
return {"client_kwargs": {"endpoint_url": s3_base}}
119+
120+
131121
@pytest.fixture
132122
def s3_resource(s3_base):
133123
import boto3

pandas/tests/io/parser/test_c_parser_only.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import numpy as np
1818
import pytest
1919

20-
from pandas.compat import is_ci_environment
2120
from pandas.compat.numpy import np_version_gte1p24
2221
from pandas.errors import (
2322
ParserError,
@@ -531,24 +530,6 @@ def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
531530
tm.assert_frame_equal(out, expected)
532531

533532

534-
@pytest.mark.single_cpu
535-
@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.")
536-
def test_bytes_exceed_2gb(c_parser_only):
537-
# see gh-16798
538-
#
539-
# Read from a "CSV" that has a column larger than 2GB.
540-
parser = c_parser_only
541-
542-
if parser.low_memory:
543-
pytest.skip("not a low_memory test")
544-
545-
# csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test
546-
# spikes up to 10.4GB on the c_high case
547-
csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]))
548-
df = parser.read_csv(csv)
549-
assert not df.empty
550-
551-
552533
def test_chunk_whitespace_on_boundary(c_parser_only):
553534
# see gh-9735: this issue is C parser-specific (bug when
554535
# parsing whitespace and characters at chunk boundary)

pandas/tests/io/parser/test_network.py

Lines changed: 17 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,13 @@
22
Tests parsers ability to read and parse non-local files
33
and hence require a network connection to be read.
44
"""
5-
from io import (
6-
BytesIO,
7-
StringIO,
8-
)
5+
from io import BytesIO
96
import logging
7+
import re
108

119
import numpy as np
1210
import pytest
1311

14-
from pandas.compat import is_ci_environment
1512
import pandas.util._test_decorators as td
1613

1714
from pandas import DataFrame
@@ -292,39 +289,23 @@ def test_read_csv_handles_boto_s3_object(
292289
tm.assert_frame_equal(result, expected)
293290

294291
@pytest.mark.single_cpu
295-
@pytest.mark.skipif(
296-
is_ci_environment(),
297-
reason="GH: 45651: This test can hang in our CI min_versions build",
298-
)
299292
def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
300293
# 8 MB, S3FS uses 5MB chunks
301-
import s3fs
302-
303-
df = DataFrame(
304-
np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd")
305-
)
306-
str_buf = StringIO()
307-
308-
df.to_csv(str_buf)
309-
310-
buf = BytesIO(str_buf.getvalue().encode("utf-8"))
311-
312-
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
313-
314-
# Possibly some state leaking in between tests.
315-
# If we don't clear this cache, we saw `GetObject operation: Forbidden`.
316-
# Presumably the s3fs instance is being cached, with the directory listing
317-
# from *before* we add the large-file.csv in the s3_public_bucket_with_data.
318-
s3fs.S3FileSystem.clear_instance_cache()
319-
320-
with caplog.at_level(logging.DEBUG, logger="s3fs"):
321-
read_csv(
322-
f"s3://{s3_public_bucket.name}/large-file.csv",
323-
nrows=5,
324-
storage_options=s3so,
325-
)
326-
# log of fetch_range (start, stop)
327-
assert (0, 5505024) in (x.args[-2:] for x in caplog.records)
294+
df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
295+
with BytesIO(df.to_csv().encode("utf-8")) as buf:
296+
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
297+
uri = f"{s3_public_bucket.name}/large-file.csv"
298+
match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
299+
with caplog.at_level(logging.DEBUG, logger="s3fs"):
300+
read_csv(
301+
f"s3://{uri}",
302+
nrows=5,
303+
storage_options=s3so,
304+
)
305+
for log in caplog.messages:
306+
if match := re.match(match_re, log):
307+
# Less than 8 MB
308+
assert int(match.group("stop")) < 8000000
328309

329310
def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
330311
# GH 25945

pandas/tests/io/test_s3.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,10 @@ def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
3030

3131

3232
@pytest.mark.single_cpu
33-
def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so):
33+
def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
3434
# Ensure we can read from a public bucket with credentials
3535
# GH 34626
36-
37-
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
38-
# see https://github.com/spulec/moto/issues/1924 & 1952
3936
pytest.importorskip("s3fs")
40-
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
41-
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
4237
df = read_csv(
4338
f"s3://{s3_public_bucket_with_data.name}/tips.csv",
4439
nrows=5,

pandas/tests/window/test_numba.py

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas.compat import (
5-
is_ci_environment,
6-
is_platform_mac,
7-
is_platform_windows,
8-
)
94
from pandas.errors import NumbaUtilError
105
import pandas.util._test_decorators as td
116

@@ -17,15 +12,7 @@
1712
)
1813
import pandas._testing as tm
1914

20-
pytestmark = [
21-
pytest.mark.single_cpu,
22-
pytest.mark.skipif(
23-
is_ci_environment() and (is_platform_windows() or is_platform_mac()),
24-
reason="On GHA CI, Windows can fail with "
25-
"'Windows fatal exception: stack overflow' "
26-
"and macOS can timeout",
27-
),
28-
]
15+
pytestmark = pytest.mark.single_cpu
2916

3017

3118
@pytest.fixture(params=["single", "table"])

pandas/tests/window/test_online.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,13 @@
11
import numpy as np
22
import pytest
33

4-
from pandas.compat import (
5-
is_ci_environment,
6-
is_platform_mac,
7-
is_platform_windows,
8-
)
9-
104
from pandas import (
115
DataFrame,
126
Series,
137
)
148
import pandas._testing as tm
159

16-
pytestmark = [
17-
pytest.mark.single_cpu,
18-
pytest.mark.skipif(
19-
is_ci_environment() and (is_platform_windows() or is_platform_mac()),
20-
reason="On GHA CI, Windows can fail with "
21-
"'Windows fatal exception: stack overflow' "
22-
"and macOS can timeout",
23-
),
24-
]
10+
pytestmark = pytest.mark.single_cpu
2511

2612
pytest.importorskip("numba")
2713

0 commit comments

Comments
 (0)