Skip to content

Commit 6296e03

Browse files
authored
TST: de-xfail some pyarrow tests (#55918)
* TST: un-xfail pyarrow verbose tests
* un-xfail pyarrow tests
* de-xfail pyarrow tests
* de-xfail pyarrow tests
* de-xfail pyarrow tests
* de-xfail pyarrow tests
* de-xfail pyarrow tests
* de-xfail pyarrow test
* De-xfail pyarrow tests
1 parent 590dfe1 commit 6296e03

10 files changed

+226 additions and -62 deletions (lines changed)

pandas/io/parsers/arrow_parser_wrapper.py

+8-1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
)
1414
from pandas.util._exceptions import find_stack_level
1515

16+
from pandas.core.dtypes.common import pandas_dtype
1617
from pandas.core.dtypes.inference import is_integer
1718

1819
import pandas as pd
@@ -203,7 +204,13 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
203204
# Ignore non-existent columns from dtype mapping
204205
# like other parsers do
205206
if isinstance(self.dtype, dict):
206-
self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns}
207+
self.dtype = {
208+
k: pandas_dtype(v)
209+
for k, v in self.dtype.items()
210+
if k in frame.columns
211+
}
212+
else:
213+
self.dtype = pandas_dtype(self.dtype)
207214
try:
208215
frame = frame.astype(self.dtype)
209216
except TypeError as e:

pandas/tests/io/parser/common/test_common_basic.py

+63-9
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,6 @@ def test_read_csv_local(all_parsers, csv1):
119119
tm.assert_frame_equal(result, expected)
120120

121121

122-
@xfail_pyarrow
123122
def test_1000_sep(all_parsers):
124123
parser = all_parsers
125124
data = """A|B|C
@@ -128,6 +127,12 @@ def test_1000_sep(all_parsers):
128127
"""
129128
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
130129

130+
if parser.engine == "pyarrow":
131+
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
132+
with pytest.raises(ValueError, match=msg):
133+
parser.read_csv(StringIO(data), sep="|", thousands=",")
134+
return
135+
131136
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
132137
tm.assert_frame_equal(result, expected)
133138

@@ -161,7 +166,6 @@ def test_csv_mixed_type(all_parsers):
161166
tm.assert_frame_equal(result, expected)
162167

163168

164-
@xfail_pyarrow
165169
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
166170
# see gh-21141
167171
parser = all_parsers
@@ -174,6 +178,13 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers):
174178
2,2,3,4
175179
3,3,4,5
176180
"""
181+
182+
if parser.engine == "pyarrow":
183+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
184+
with pytest.raises(ValueError, match=msg):
185+
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
186+
return
187+
177188
result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
178189
expected = DataFrame(columns=["A", "B", "C"])
179190
tm.assert_frame_equal(result, expected)
@@ -212,7 +223,6 @@ def test_read_csv_dataframe(all_parsers, csv1):
212223
tm.assert_frame_equal(result, expected)
213224

214225

215-
@xfail_pyarrow
216226
@pytest.mark.parametrize("nrows", [3, 3.0])
217227
def test_read_nrows(all_parsers, nrows):
218228
# see gh-10476
@@ -230,11 +240,16 @@ def test_read_nrows(all_parsers, nrows):
230240
)
231241
parser = all_parsers
232242

243+
if parser.engine == "pyarrow":
244+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
245+
with pytest.raises(ValueError, match=msg):
246+
parser.read_csv(StringIO(data), nrows=nrows)
247+
return
248+
233249
result = parser.read_csv(StringIO(data), nrows=nrows)
234250
tm.assert_frame_equal(result, expected)
235251

236252

237-
@xfail_pyarrow
238253
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
239254
def test_read_nrows_bad(all_parsers, nrows):
240255
data = """index,A,B,C,D
@@ -247,6 +262,8 @@ def test_read_nrows_bad(all_parsers, nrows):
247262
"""
248263
msg = r"'nrows' must be an integer >=0"
249264
parser = all_parsers
265+
if parser.engine == "pyarrow":
266+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
250267

251268
with pytest.raises(ValueError, match=msg):
252269
parser.read_csv(StringIO(data), nrows=nrows)
@@ -277,7 +294,6 @@ def test_missing_trailing_delimiters(all_parsers):
277294
tm.assert_frame_equal(result, expected)
278295

279296

280-
@xfail_pyarrow
281297
def test_skip_initial_space(all_parsers):
282298
data = (
283299
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
@@ -289,6 +305,18 @@ def test_skip_initial_space(all_parsers):
289305
)
290306
parser = all_parsers
291307

308+
if parser.engine == "pyarrow":
309+
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
310+
with pytest.raises(ValueError, match=msg):
311+
parser.read_csv(
312+
StringIO(data),
313+
names=list(range(33)),
314+
header=None,
315+
na_values=["-9999.0"],
316+
skipinitialspace=True,
317+
)
318+
return
319+
292320
result = parser.read_csv(
293321
StringIO(data),
294322
names=list(range(33)),
@@ -437,7 +465,6 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
437465
tm.assert_frame_equal(result, expected)
438466

439467

440-
@xfail_pyarrow
441468
@pytest.mark.parametrize(
442469
"kwargs,expected",
443470
[
@@ -467,6 +494,12 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
467494
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
468495
parser = all_parsers
469496

497+
if parser.engine == "pyarrow":
498+
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
499+
with pytest.raises(ValueError, match=msg):
500+
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
501+
return
502+
470503
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
471504
tm.assert_frame_equal(result, expected)
472505

@@ -488,7 +521,6 @@ def test_read_filepath_or_buffer(all_parsers):
488521
parser.read_csv(filepath_or_buffer=b"input")
489522

490523

491-
@xfail_pyarrow
492524
@pytest.mark.parametrize("delim_whitespace", [True, False])
493525
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
494526
# see gh-9710
@@ -501,6 +533,15 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
501533
b\n"""
502534

503535
expected = DataFrame({"MyColumn": list("abab")})
536+
537+
if parser.engine == "pyarrow":
538+
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
539+
with pytest.raises(ValueError, match=msg):
540+
parser.read_csv(
541+
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
542+
)
543+
return
544+
504545
result = parser.read_csv(
505546
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
506547
)
@@ -688,7 +729,6 @@ def test_first_row_bom_unquoted(all_parsers):
688729
tm.assert_frame_equal(result, expected)
689730

690731

691-
@xfail_pyarrow
692732
@pytest.mark.parametrize("nrows", range(1, 6))
693733
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
694734
# GH 28071
@@ -698,6 +738,15 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
698738
)
699739
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
700740
parser = all_parsers
741+
742+
if parser.engine == "pyarrow":
743+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
744+
with pytest.raises(ValueError, match=msg):
745+
parser.read_csv(
746+
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
747+
)
748+
return
749+
701750
df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
702751
tm.assert_frame_equal(df, ref[:nrows])
703752

@@ -731,11 +780,16 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
731780
parser.read_csv(StringIO(data), names=set("QAZ"))
732781

733782

734-
@xfail_pyarrow
735783
def test_read_table_delim_whitespace_default_sep(all_parsers):
736784
# GH: 35958
737785
f = StringIO("a b c\n1 -2 -3\n4 5 6")
738786
parser = all_parsers
787+
788+
if parser.engine == "pyarrow":
789+
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
790+
with pytest.raises(ValueError, match=msg):
791+
parser.read_table(f, delim_whitespace=True)
792+
return
739793
result = parser.read_table(f, delim_whitespace=True)
740794
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
741795
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/common/test_decimal.py

+8-3
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,7 @@
1313
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
1414
)
1515

16-
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
1716

18-
19-
@xfail_pyarrow
2017
@pytest.mark.parametrize(
2118
"data,thousands,decimal",
2219
[
@@ -42,6 +39,14 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
4239
parser = all_parsers
4340
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
4441

42+
if parser.engine == "pyarrow":
43+
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
44+
with pytest.raises(ValueError, match=msg):
45+
parser.read_csv(
46+
StringIO(data), sep="|", thousands=thousands, decimal=decimal
47+
)
48+
return
49+
4550
result = parser.read_csv(
4651
StringIO(data), sep="|", thousands=thousands, decimal=decimal
4752
)

pandas/tests/io/parser/common/test_file_buffer_url.py

+37-9
Original file line number | Diff line number | Diff line change
@@ -214,8 +214,14 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
214214
# see gh-10728, gh-10548
215215
parser = all_parsers
216216

217+
if parser.engine == "pyarrow" and "comment" in kwargs:
218+
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
219+
with pytest.raises(ValueError, match=msg):
220+
parser.read_csv(StringIO(data), **kwargs)
221+
return
222+
217223
if parser.engine == "pyarrow" and "\r" not in data:
218-
mark = pytest.mark.xfail(reason="The 'comment' option is not supported")
224+
mark = pytest.mark.xfail(reason="Mismatched exception type/message")
219225
request.applymarker(mark)
220226

221227
if expected is None:
@@ -356,7 +362,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding):
356362
assert not handle.closed
357363

358364

359-
@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
360365
def test_memory_map_compression(all_parsers, compression):
361366
"""
362367
Support memory map for compressed files.
@@ -369,19 +374,32 @@ def test_memory_map_compression(all_parsers, compression):
369374
with tm.ensure_clean() as path:
370375
expected.to_csv(path, index=False, compression=compression)
371376

372-
tm.assert_frame_equal(
373-
parser.read_csv(path, memory_map=True, compression=compression),
374-
expected,
375-
)
377+
if parser.engine == "pyarrow":
378+
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
379+
with pytest.raises(ValueError, match=msg):
380+
parser.read_csv(path, memory_map=True, compression=compression)
381+
return
382+
383+
result = parser.read_csv(path, memory_map=True, compression=compression)
384+
385+
tm.assert_frame_equal(
386+
result,
387+
expected,
388+
)
376389

377390

378-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
379391
def test_context_manager(all_parsers, datapath):
380392
# make sure that opened files are closed
381393
parser = all_parsers
382394

383395
path = datapath("io", "data", "csv", "iris.csv")
384396

397+
if parser.engine == "pyarrow":
398+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
399+
with pytest.raises(ValueError, match=msg):
400+
parser.read_csv(path, chunksize=1)
401+
return
402+
385403
reader = parser.read_csv(path, chunksize=1)
386404
assert not reader.handles.handle.closed
387405
try:
@@ -392,12 +410,17 @@ def test_context_manager(all_parsers, datapath):
392410
assert reader.handles.handle.closed
393411

394412

395-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
396413
def test_context_manageri_user_provided(all_parsers, datapath):
397414
# make sure that user-provided handles are not closed
398415
parser = all_parsers
399416

400417
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
418+
if parser.engine == "pyarrow":
419+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
420+
with pytest.raises(ValueError, match=msg):
421+
parser.read_csv(path, chunksize=1)
422+
return
423+
401424
reader = parser.read_csv(path, chunksize=1)
402425
assert not reader.handles.handle.closed
403426
try:
@@ -417,7 +440,6 @@ def test_file_descriptor_leak(all_parsers, using_copy_on_write):
417440
parser.read_csv(path)
418441

419442

420-
@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
421443
def test_memory_map(all_parsers, csv_dir_path):
422444
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
423445
parser = all_parsers
@@ -426,5 +448,11 @@ def test_memory_map(all_parsers, csv_dir_path):
426448
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
427449
)
428450

451+
if parser.engine == "pyarrow":
452+
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
453+
with pytest.raises(ValueError, match=msg):
454+
parser.read_csv(mmap_file, memory_map=True)
455+
return
456+
429457
result = parser.read_csv(mmap_file, memory_map=True)
430458
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/common/test_ints.py

+16-8
Original file line number | Diff line number | Diff line change
@@ -126,10 +126,8 @@ def test_int64_min_issues(all_parsers):
126126
tm.assert_frame_equal(result, expected)
127127

128128

129-
# ValueError: The 'converters' option is not supported with the 'pyarrow' engine
130-
@xfail_pyarrow
131129
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
132-
def test_int64_overflow(all_parsers, conv):
130+
def test_int64_overflow(all_parsers, conv, request):
133131
data = """ID
134132
00013007854817840016671868
135133
00013007854817840016749251
@@ -143,6 +141,10 @@ def test_int64_overflow(all_parsers, conv):
143141
if conv is None:
144142
# 13007854817840016671868 > UINT64_MAX, so this
145143
# will overflow and return object as the dtype.
144+
if parser.engine == "pyarrow":
145+
mark = pytest.mark.xfail(reason="parses to float64")
146+
request.applymarker(mark)
147+
146148
result = parser.read_csv(StringIO(data))
147149
expected = DataFrame(
148150
[
@@ -161,13 +163,19 @@ def test_int64_overflow(all_parsers, conv):
161163
# 13007854817840016671868 > UINT64_MAX, so attempts
162164
# to cast to either int64 or uint64 will result in
163165
# an OverflowError being raised.
164-
msg = (
165-
"(Python int too large to convert to C long)|"
166-
"(long too big to convert)|"
167-
"(int too big to convert)"
166+
msg = "|".join(
167+
[
168+
"Python int too large to convert to C long",
169+
"long too big to convert",
170+
"int too big to convert",
171+
]
168172
)
173+
err = OverflowError
174+
if parser.engine == "pyarrow":
175+
err = ValueError
176+
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
169177

170-
with pytest.raises(OverflowError, match=msg):
178+
with pytest.raises(err, match=msg):
171179
parser.read_csv(StringIO(data), converters={"ID": conv})
172180

173181

0 commit comments

Comments
 (0)