I am done

commit 40e2a747cf
parent 720dc28c09
2024-10-30 22:14:35 +01:00
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,378 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
parser = all_parsers
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
expected = DataFrame(
[
["foo", 2, 3, 4, 5],
["bar", 7, 8, 9, 10],
["baz", 12, 13, 14, 15],
["qux", 12, 13, 14, 15],
["foo2", 12, 13, 14, 15],
["bar2", 12, 13, 14, 15],
],
columns=["index", "A", "B", "C", "D"],
)
expected = expected.set_index("index")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
list(reader)
return
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])
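
# A minimal sketch (not part of the suite; inline CSV is made up) of the
# chunked-reading API exercised above: read_csv with chunksize returns a
# TextFileReader that works as a context manager and yields one DataFrame
# per chunk.
def _sketch_chunked_reading():
    import pandas as pd

    csv = "index,A,B\nfoo,1,2\nbar,3,4\nbaz,5,6\n"
    with pd.read_csv(StringIO(csv), index_col=0, chunksize=2) as reader:
        return [chunk.shape for chunk in reader]  # [(2, 2), (1, 2)]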
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
msg = r"'chunksize' must be an integer >=1"
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
pass
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
# see gh-15755
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), expected)
def test_read_chunksize_and_nrows_changing_size(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
with pytest.raises(StopIteration, match=""):
reader.get_chunk(size=3)
def test_get_chunk_passed_chunksize(all_parsers):
parser = all_parsers
data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2) as reader:
reader.get_chunk()
return
with parser.read_csv(StringIO(data), chunksize=2) as reader:
result = reader.get_chunk()
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
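
# Sketch (not part of the suite): get_chunk() defaults to the reader's
# chunksize; an explicit ``size`` overrides it per call, and StopIteration
# is raised once the input is exhausted.
def _sketch_get_chunk():
    import pandas as pd

    with pd.read_csv(StringIO("A,B\n1,2\n3,4\n5,6\n"), chunksize=2) as reader:
        first = reader.get_chunk()       # two rows: the chunksize default
        rest = reader.get_chunk(size=1)  # one row: per-call override
        return first, rest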
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
# see gh-12185
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
concat(reader)
return
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
via_reader = concat(reader)
tm.assert_frame_equal(via_reader, result)
def test_read_chunksize_jagged_names(all_parsers):
# see gh-23509
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(
StringIO(data), names=range(10), chunksize=4
) as reader:
concat(reader)
return
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
def test_chunk_begins_with_newline_whitespace(all_parsers):
# see gh-10022
parser = all_parsers
data = "\n hello\nworld\n"
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([" hello", "world"])
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
# mainly an issue with the C parser
heuristic = 2**3
parser = all_parsers
integers = [str(i) for i in range(heuristic - 1)]
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
# Coercions should work without warnings.
with monkeypatch.context() as m:
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
result = parser.read_csv(StringIO(data))
assert type(result.a[0]) is np.float64
assert result.a.dtype == float
def test_warn_if_chunks_have_mismatched_type(all_parsers):
warning_type = None
parser = all_parsers
size = 10000
# see gh-3866: if chunks are different types and can't
# be coerced using numerical types, then issue warning.
if parser.engine == "c" and parser.low_memory:
warning_type = DtypeWarning
# Use larger size to hit warning path
size = 499999
integers = [str(i) for i in range(size)]
data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
buf = StringIO(data)
if parser.engine == "pyarrow":
df = parser.read_csv(
buf,
)
else:
df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)
assert df.a.dtype == object
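
# Sketch (not part of the suite): the usual remedies for the DtypeWarning
# exercised above are an explicit dtype, which skips per-chunk inference,
# or low_memory=False, which infers types in a single pass (the warning is
# only reachable with the C engine).
def _sketch_silence_dtype_warning():
    import pandas as pd

    mixed = "a\n" + "\n".join(["1"] * 5 + ["x"])
    explicit = pd.read_csv(StringIO(mixed), dtype={"a": str})
    single_pass = pd.read_csv(StringIO(mixed), low_memory=False)
    return explicit, single_pass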
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
parser = all_parsers
expected = DataFrame(columns=["foo", "bar"])
nrows = 10
data = StringIO("foo,bar\n")
if parser.engine == "pyarrow":
msg = (
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
next(iter(reader))
else:
parser.read_csv(data, nrows=nrows)
return
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
result = next(iter(reader))
else:
result = parser.read_csv(data, nrows=nrows)
tm.assert_frame_equal(result, expected)
def test_read_csv_memory_growth_chunksize(all_parsers):
# see gh-24805
#
# Let's just make sure that we don't crash
# as we iteratively process all chunks.
parser = all_parsers
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
for i in range(1000):
f.write(str(i) + "\n")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
return
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
data = """1,2,3,4
5,6,7,8
9,10,11
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
return
result_chunks = parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
expected_frames = [
DataFrame({"a": [1, 5], "b": [2, 6]}),
DataFrame({"a": [9], "b": [10]}, index=[2]),
]
for i, result in enumerate(result_chunks):
tm.assert_frame_equal(result, expected_frames[i])
def test_chunksize_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=2)
return
result_chunks = parser.read_csv(StringIO(data), chunksize=2)
expected_frames = [
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
]
for i, result in enumerate(result_chunks):
tm.assert_frame_equal(result, expected_frames[i])


@@ -0,0 +1,979 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from inspect import signature
from io import StringIO
import os
from pathlib import Path
import sys
import numpy as np
import pytest
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
Timestamp,
compat,
)
import pandas._testing as tm
from pandas.io.parsers import TextFileReader
from pandas.io.parsers.c_parser_wrapper import CParserWrapper
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_override_set_noconvert_columns():
# see gh-17351
#
# Usecols needs to be sorted in _set_noconvert_columns based
# on the test_usecols_with_parse_dates test from test_usecols.py
class MyTextFileReader(TextFileReader):
def __init__(self) -> None:
self._currow = 0
self.squeeze = False
class MyCParserWrapper(CParserWrapper):
def _set_noconvert_columns(self):
if self.usecols_dtype == "integer":
# self.usecols is a set, which is documented as unordered
# but in practice, a CPython set of integers is sorted.
# In other implementations this assumption does not hold.
# The following code simulates a different order, which
# before GH 17351 would cause the wrong columns to be
# converted via the parse_dates parameter
self.usecols = list(self.usecols)
self.usecols.reverse()
return CParserWrapper._set_noconvert_columns(self)
data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parse_dates = [[1, 2]]
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
parser = MyTextFileReader()
parser.options = {
"usecols": [0, 2, 3],
"parse_dates": parse_dates,
"delimiter": ",",
}
parser.engine = "c"
parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
result = parser.read()
tm.assert_frame_equal(result, expected)
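
# Sketch (not part of the suite) of the public-API behavior the subclass
# above guards: usecols combined with parse_dates must be mapped onto the
# *selected* columns in a deterministic order, whatever the internal
# ordering of usecols.
def _sketch_usecols_with_parse_dates():
    import pandas as pd

    data = "a,b,c,d,e\n0,1,2014-01-01,09:00,4\n0,1,2014-01-02,10:00,4"
    # Keeps columns a, c, d and parses "c" as datetime64.
    return pd.read_csv(StringIO(data), usecols=[0, 2, 3], parse_dates=["c"])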
def test_read_csv_local(all_parsers, csv1):
prefix = "file:///" if compat.is_platform_windows() else "file://"
parser = all_parsers
fname = prefix + str(os.path.abspath(csv1))
result = parser.read_csv(fname, index_col=0, parse_dates=True)
# TODO: make unit check more specific
if parser.engine == "pyarrow":
result.index = result.index.as_unit("ns")
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738],
[1.047916, -0.041232, -0.16181208307, 0.212549],
[0.498581, 0.731168, -0.537677223318, 1.346270],
[1.120202, 1.567621, 0.00364077397681, 0.675253],
[-0.487094, 0.571455, -1.6116394093, 0.103469],
[0.836649, 0.246462, 0.588542635376, 1.062782],
[-0.157161, 1.340307, 1.1957779562, -1.097007],
],
columns=["A", "B", "C", "D"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
datetime(2000, 1, 10),
datetime(2000, 1, 11),
],
name="index",
),
)
tm.assert_frame_equal(result, expected)
def test_1000_sep(all_parsers):
parser = all_parsers
data = """A|B|C
1|2,334|5
10|13|10.
"""
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep="|", thousands=",")
return
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
dtype=np.int64,
columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
)
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_csv_mixed_type(all_parsers):
data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
parser = all_parsers
expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
# see gh-21141
parser = all_parsers
if not parser.low_memory:
pytest.skip("This is a low-memory specific test")
data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
return
result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
expected = DataFrame(columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
def test_read_csv_dataframe(all_parsers, csv1):
parser = all_parsers
result = parser.read_csv(csv1, index_col=0, parse_dates=True)
# TODO: make unit check more specific
if parser.engine == "pyarrow":
result.index = result.index.as_unit("ns")
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738],
[1.047916, -0.041232, -0.16181208307, 0.212549],
[0.498581, 0.731168, -0.537677223318, 1.346270],
[1.120202, 1.567621, 0.00364077397681, 0.675253],
[-0.487094, 0.571455, -1.6116394093, 0.103469],
[0.836649, 0.246462, 0.588542635376, 1.062782],
[-0.157161, 1.340307, 1.1957779562, -1.097007],
],
columns=["A", "B", "C", "D"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
datetime(2000, 1, 10),
datetime(2000, 1, 11),
],
name="index",
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
# see gh-10476
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
expected = DataFrame(
[["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
columns=["index", "A", "B", "C", "D"],
)
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
return
result = parser.read_csv(StringIO(data), nrows=nrows)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
msg = r"'nrows' must be an integer >=0"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
def test_nrows_skipfooter_errors(all_parsers):
msg = "'skipfooter' not supported with 'nrows'"
data = "a\n1\n2\n3\n4\n5\n6"
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
parser = all_parsers
data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
columns=["A", "B", "C", "D"],
)
tm.assert_frame_equal(result, expected)
def test_skip_initial_space(all_parsers):
data = (
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
"1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
"70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
"0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
"-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
)
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
return
result = parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
expected = DataFrame(
[
[
"09-Apr-2012",
"01:10:18.300",
2456026.548822908,
12849,
1.00361,
1.12551,
330.65659,
355626618.16711,
73.48821,
314.11625,
1917.09447,
179.71425,
80.0,
240.0,
-350,
70.06056,
344.9837,
1,
1,
-0.689265,
-0.692787,
0.212036,
14.7674,
41.605,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
0,
12,
128,
]
]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_trailing_delimiters(all_parsers):
# see gh-2442
data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
tm.assert_frame_equal(result, expected)
def test_escapechar(all_parsers):
# https://stackoverflow.com/questions/13824840/feature-request-for-
# pandas-read-csv
data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''
parser = all_parsers
result = parser.read_csv(
StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
)
assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
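
# Sketch (not part of the suite): escapechar lets a backslash protect the
# quote character inside a quoted field, as in the IKEA example above.
def _sketch_escapechar():
    import pandas as pd

    data = 'name,comment\n"x","she said \\"hi\\""\n'
    df = pd.read_csv(StringIO(data), escapechar="\\", quotechar='"')
    assert df.loc[0, "comment"] == 'she said "hi"'
    return df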
def test_ignore_leading_whitespace(all_parsers):
# see gh-3374, gh-6607
parser = all_parsers
data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+")
return
result = parser.read_csv(StringIO(data), sep=r"\s+")
expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
# see gh-12203
parser = all_parsers
data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""
if usecols is None:
# Make sure that an error is still raised
# when the "usecols" parameter is not provided.
msg = r"Expected \d+ fields in line \d+, saw \d+"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))
else:
expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
"data,kwargs,expected",
[
# First, check that the parser raises the correct error when no
# columns are provided, with or without usecols.
("", {}, None),
("", {"usecols": ["X"]}, None),
(
",,",
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
DataFrame(columns=["X"], index=[0], dtype=np.float64),
),
(
"",
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
DataFrame(columns=["X"]),
),
],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
# see gh-12493
parser = all_parsers
if expected is None:
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
# gh-8661, gh-8679: this should ignore six lines, including
# lines with trailing whitespace and blank lines.
(
{
"header": None,
"delim_whitespace": True,
"skiprows": [0, 1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
),
# gh-8983: test skipping set of rows after a row with trailing spaces.
(
{
"delim_whitespace": True,
"skiprows": [1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
),
],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)
def test_raise_on_sep_with_delim_whitespace(all_parsers):
# see gh-6607
data = "a b c\n1 2 3"
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with pytest.raises(ValueError, match="you can only specify one"):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
def test_read_filepath_or_buffer(all_parsers):
# see gh-43366
parser = all_parsers
with pytest.raises(TypeError, match="Expected file path name or file-like"):
parser.read_csv(filepath_or_buffer=b"input")
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
# see gh-9710
parser = all_parsers
data = """\
MyColumn
a
b
a
b\n"""
expected = DataFrame({"MyColumn": list("abab")})
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data),
skipinitialspace=True,
delim_whitespace=delim_whitespace,
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sep,skip_blank_lines,exp_data",
[
(",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
(r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
(
",",
False,
[
[1.0, 2.0, 4.0],
[np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan],
[5.0, np.nan, 10.0],
[np.nan, np.nan, np.nan],
[-70.0, 0.4, 1.0],
],
),
],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
parser = all_parsers
data = """\
A,B,C
1,2.,4.
5.,NaN,10.0
-70,.4,1
"""
if sep == r"\s+":
data = data.replace(",", " ")
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
)
return
result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
expected = DataFrame(exp_data, columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_whitespace_lines(all_parsers):
parser = all_parsers
data = """
\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
"""
expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,expected",
[
(
""" A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
DataFrame(
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
columns=["A", "B", "C", "D"],
index=["a", "b", "c"],
),
),
(
" a b c\n1 2 3 \n4 5 6\n 7 8 9",
DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
),
],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
# see gh-6607
parser = all_parsers
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+")
return
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
def test_sub_character(all_parsers, csv_dir_path):
# see gh-16893
filename = os.path.join(csv_dir_path, "sub_char.csv")
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
parser = all_parsers
result = parser.read_csv(filename)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
# see gh-15086.
parser = all_parsers
df = DataFrame({"a": [1, 2, 3]})
with tm.ensure_clean(filename) as path:
df.to_csv(path, index=False)
result = parser.read_csv(path)
tm.assert_frame_equal(result, df)
def test_read_table_same_signature_as_read_csv(all_parsers):
# GH-34976
parser = all_parsers
table_sign = signature(parser.read_table)
csv_sign = signature(parser.read_csv)
assert table_sign.parameters.keys() == csv_sign.parameters.keys()
assert table_sign.return_annotation == csv_sign.return_annotation
for key, csv_param in csv_sign.parameters.items():
table_param = table_sign.parameters[key]
if key == "sep":
assert csv_param.default == ","
assert table_param.default == "\t"
assert table_param.annotation == csv_param.annotation
assert table_param.kind == csv_param.kind
continue
assert table_param == csv_param
def test_read_table_equivalency_to_read_csv(all_parsers):
# see gh-21948
# As of 0.25.0, read_table is undeprecated
parser = all_parsers
data = "a\tb\n1\t2\n3\t4"
expected = parser.read_csv(StringIO(data), sep="\t")
result = parser.read_table(StringIO(data))
tm.assert_frame_equal(result, expected)
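
# Sketch (not part of the suite): read_table is read_csv with the separator
# defaulting to a tab, so these two calls produce identical frames.
def _sketch_read_table_default_sep():
    import pandas as pd

    left = pd.read_table(StringIO("a\tb\n1\t2"))
    right = pd.read_csv(StringIO("a\tb\n1\t2"), sep="\t")
    assert left.equals(right)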
@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
# GH#41069
parser = all_parsers
data = "a b\n0 1"
sys.setprofile(lambda *a, **k: None)
result = getattr(parser, read_func)(StringIO(data))
sys.setprofile(None)
expected = DataFrame({"a b": ["0 1"]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_first_row_bom(all_parsers):
# see gh-26545
parser = all_parsers
data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
result = parser.read_csv(StringIO(data), delimiter="\t")
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
# see gh-36343
parser = all_parsers
data = """\ufeffHead1\tHead2\tHead3"""
result = parser.read_csv(StringIO(data), delimiter="\t")
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
# GH 28071
ref = DataFrame(
[[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
columns=list("ab"),
)
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
)
return
df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
tm.assert_frame_equal(df, ref[:nrows])
@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
# GH 26218
column_names = ["one", "two", "three"]
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
stream = StringIO("foo,bar,baz,bam,blah")
parser = all_parsers
df = parser.read_csv_check_warnings(
ParserWarning,
"Length of header or names does not match length of data. "
"This leads to a loss of data with index_col=False.",
stream,
header=None,
names=column_names,
index_col=False,
)
tm.assert_frame_equal(df, ref)
def test_read_csv_names_not_accepting_sets(all_parsers):
# GH 34946
data = """\
1,2,3
4,5,6\n"""
parser = all_parsers
with pytest.raises(ValueError, match="Names should be an ordered collection."):
parser.read_csv(StringIO(data), names=set("QAZ"))
def test_read_table_delim_whitespace_default_sep(all_parsers):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
def test_read_csv_delimiter_and_sep_no_default(all_parsers):
# GH#39823
f = StringIO("a,b\n1,2")
parser = all_parsers
msg = "Specified a sep and a delimiter; you can only specify one."
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, sep=" ", delimiter=".")
@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
# GH#43528
parser = all_parsers
data = """a,b,c
1,2,3
"""
msg = (
r"Specified \\n as separator or delimiter. This forces the python engine "
r"which does not accept a line terminator. Hence it is not allowed to use "
r"the line terminator as separator."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
# GH: 36928
data = "1,2"
keys = {"a": int, "b": int}.keys()
parser = all_parsers
result = parser.read_csv(StringIO(data), names=keys)
expected = DataFrame({"a": [1], "b": [2]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
# GH39017
parser = all_parsers
content = b"\xed\xbd\xbf"
decoded = content.decode("utf-8", errors="surrogatepass")
expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
expected.index.name = decoded * 2
with tm.ensure_clean() as path:
Path(path).write_bytes(
content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
)
df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
tm.assert_frame_equal(df, expected)
with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
parser.read_csv(path)
def test_malformed_second_line(all_parsers):
# see GH14782
parser = all_parsers
data = "\na\nb\n"
result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
expected = DataFrame({"a": ["b"]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_short_single_line(all_parsers):
# GH 47566
parser = all_parsers
columns = ["a", "b", "c"]
data = "1,2"
result = parser.read_csv(StringIO(data), header=None, names=columns)
expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
# GH 47566
parser = all_parsers
columns = ["a", "b", "c"]
data = "1,2\n1,2"
result = parser.read_csv(StringIO(data), header=None, names=columns)
expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
tm.assert_frame_equal(result, expected)
def test_read_seek(all_parsers):
# GH48646
parser = all_parsers
prefix = "### DATA\n"
content = "nkey,value\ntables,rectangular\n"
with tm.ensure_clean() as path:
Path(path).write_text(prefix + content, encoding="utf-8")
with open(path, encoding="utf-8") as file:
file.readline()
actual = parser.read_csv(file)
expected = parser.read_csv(StringIO(content))
tm.assert_frame_equal(actual, expected)


@@ -0,0 +1,91 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import csv
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import TextParser
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow
def test_read_data_list(all_parsers):
parser = all_parsers
kwargs = {"index_col": 0}
data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
expected = parser.read_csv(StringIO(data), **kwargs)
with TextParser(data_list, chunksize=2, **kwargs) as parser:
result = parser.read()
tm.assert_frame_equal(result, expected)
def test_reader_list(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
lines = list(csv.reader(StringIO(data)))
with TextParser(lines, chunksize=2, **kwargs) as reader:
chunks = list(reader)
expected = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])
def test_reader_list_skiprows(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
lines = list(csv.reader(StringIO(data)))
with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
chunks = list(reader)
expected = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(chunks[0], expected[1:3])
def test_read_csv_parse_simple_list(all_parsers):
parser = all_parsers
data = """foo
bar baz
qux foo
foo
bar"""
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,72 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
"data,thousands,decimal",
[
(
"""A|B|C
1|2,334.01|5
10|13|10.
""",
",",
".",
),
(
"""A|B|C
1|2.334,01|5
10|13|10,
""",
".",
",",
),
],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
parser = all_parsers
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
return
result = parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
tm.assert_frame_equal(result, expected)
def test_euro_decimal_format(all_parsers):
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""
result = parser.read_csv(StringIO(data), sep=";", decimal=",")
expected = DataFrame(
[
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
[2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
[3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
],
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
)
tm.assert_frame_equal(result, expected)
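
# Sketch (not part of the suite): European-style numbers use "." for
# thousands and "," for decimals, mirroring the second parametrized case.
def _sketch_european_numbers():
    import pandas as pd

    data = "A;B\n1.234,5;6,7\n"
    # Yields A == 1234.5 and B == 6.7 as float64 columns.
    return pd.read_csv(StringIO(data), sep=";", thousands=".", decimal=",")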


@@ -0,0 +1,478 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import (
BytesIO,
StringIO,
)
import os
import platform
from urllib.error import URLError
import uuid
import numpy as np
import pytest
from pandas.errors import (
EmptyDataError,
ParserError,
)
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
parser = all_parsers
kwargs = {"sep": "\t"}
local_path = os.path.join(csv_dir_path, "salaries.csv")
with open(local_path, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
url_result = parser.read_csv(httpserver.url, **kwargs)
local_result = parser.read_csv(local_path, **kwargs)
tm.assert_frame_equal(url_result, local_result)
@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
parser = all_parsers
kwargs = {"sep": "\t"}
local_path = os.path.join(csv_dir_path, "salaries.csv")
local_result = parser.read_csv(local_path, **kwargs)
url = "file://localhost/" + local_path
try:
url_result = parser.read_csv(url, **kwargs)
tm.assert_frame_equal(url_result, local_result)
except URLError:
# Fails on some systems.
pytest.skip("Failing on: " + " ".join(platform.uname()))
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_local_path(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
result = tm.round_trip_localpath(
df.to_csv, lambda p: parser.read_csv(p, index_col=0)
)
tm.assert_frame_equal(df, result)
def test_nonexistent_path(all_parsers):
# gh-2428: pls no segfault
# gh-14086: raise more helpful FileNotFoundError
# GH#29233 "File foo" instead of "File b'foo'"
parser = all_parsers
path = f"{uuid.uuid4()}.csv"
msg = r"\[Errno 2\]"
with pytest.raises(FileNotFoundError, match=msg) as e:
parser.read_csv(path)
assert path == e.value.filename
@td.skip_if_windows # os.chmod does not work in windows
def test_no_permission(all_parsers):
# GH 23784
parser = all_parsers
msg = r"\[Errno 13\]"
with tm.ensure_clean() as path:
os.chmod(path, 0) # make file unreadable
# verify that this process cannot open the file (not running as sudo)
try:
with open(path, encoding="utf-8"):
pass
pytest.skip("Running as sudo.")
except PermissionError:
pass
with pytest.raises(PermissionError, match=msg) as e:
parser.read_csv(path)
assert path == e.value.filename
@pytest.mark.parametrize(
"data,kwargs,expected,msg",
[
# gh-10728: WHITESPACE_LINE
(
"a,b,c\n4,5,6\n ",
{},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# gh-10548: EAT_LINE_COMMENT
(
"a,b,c\n4,5,6\n#comment",
{"comment": "#"},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_CRNL_NOP
(
"a,b,c\n4,5,6\n\r",
{},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_COMMENT
(
"a,b,c\n4,5,6#comment",
{"comment": "#"},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# SKIP_LINE
(
"a,b,c\n4,5,6\nskipme",
{"skiprows": [2]},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_LINE_COMMENT
(
"a,b,c\n4,5,6\n#comment",
{"comment": "#", "skip_blank_lines": False},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# IN_FIELD
(
"a,b,c\n4,5,6\n ",
{"skip_blank_lines": False},
DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
None,
),
# EAT_CRNL
(
"a,b,c\n4,5,6\n\r",
{"skip_blank_lines": False},
DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
None,
),
# ESCAPED_CHAR
(
"a,b,c\n4,5,6\n\\",
{"escapechar": "\\"},
None,
"(EOF following escape character)|(unexpected end of data)",
),
# ESCAPE_IN_QUOTED_FIELD
(
'a,b,c\n4,5,6\n"\\',
{"escapechar": "\\"},
None,
"(EOF inside string starting at row 2)|(unexpected end of data)",
),
# IN_QUOTED_FIELD
(
'a,b,c\n4,5,6\n"',
{"escapechar": "\\"},
None,
"(EOF inside string starting at row 2)|(unexpected end of data)",
),
],
ids=[
"whitespace-line",
"eat-line-comment",
"eat-crnl-nop",
"eat-comment",
"skip-line",
"eat-line-comment",
"in-field",
"eat-crnl",
"escaped-char",
"escape-in-quoted-field",
"in-quoted-field",
],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
# see gh-10728, gh-10548
parser = all_parsers
if parser.engine == "pyarrow" and "comment" in kwargs:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
if parser.engine == "pyarrow" and "\r" not in data:
# pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
# ValueError: skiprows argument must be an integer when using engine='pyarrow'
# AssertionError: Regex pattern did not match.
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
if expected is None:
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_temporary_file(all_parsers):
# see gh-13398
parser = all_parsers
data = "0 0"
with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
new_file.write(data)
new_file.flush()
new_file.seek(0)
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(new_file, sep=r"\s+", header=None)
return
result = parser.read_csv(new_file, sep=r"\s+", header=None)
expected = DataFrame([[0, 0]])
tm.assert_frame_equal(result, expected)
def test_internal_eof_byte(all_parsers):
# see gh-5500
parser = all_parsers
data = "a,b\n1\x1a,2"
expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_internal_eof_byte_to_file(all_parsers):
# see gh-16559
parser = all_parsers
data = b'c1,c2\r\n"test \x1a test", test\r\n'
expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
path = f"__{uuid.uuid4()}__.csv"
with tm.ensure_clean(path) as path:
with open(path, "wb") as f:
f.write(data)
result = parser.read_csv(path)
tm.assert_frame_equal(result, expected)
def test_file_handle_string_io(all_parsers):
# gh-14418
#
# Don't close user provided file handles.
parser = all_parsers
data = "a,b\n1,2"
fh = StringIO(data)
parser.read_csv(fh)
assert not fh.closed
def test_file_handles_with_open(all_parsers, csv1):
# gh-14418
#
# Don't close user provided file handles.
parser = all_parsers
for mode in ["r", "rb"]:
with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
parser.read_csv(f)
assert not f.closed
def test_invalid_file_buffer_class(all_parsers):
# see gh-15337
class InvalidBuffer:
pass
parser = all_parsers
msg = "Invalid file path or buffer object type"
with pytest.raises(ValueError, match=msg):
parser.read_csv(InvalidBuffer())
def test_invalid_file_buffer_mock(all_parsers):
# see gh-15337
parser = all_parsers
msg = "Invalid file path or buffer object type"
class Foo:
pass
with pytest.raises(ValueError, match=msg):
parser.read_csv(Foo())
def test_valid_file_buffer_seems_invalid(all_parsers):
# gh-16135: we want to ensure that "tell" and "seek"
# aren't actually being used when we call `read_csv`
#
# Thus, while the object may look "invalid" (these
# methods are attributes of the `StringIO` class),
# it is still a valid file-object for our purposes.
class NoSeekTellBuffer(StringIO):
def tell(self):
raise AttributeError("No tell method")
def seek(self, pos, whence=0):
raise AttributeError("No seek method")
data = "a\n1"
parser = all_parsers
expected = DataFrame({"a": [1]})
result = parser.read_csv(NoSeekTellBuffer(data))
tm.assert_frame_equal(result, expected)
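
# Sketch (not part of the suite): per the test above, read_csv does not
# require seek/tell. A hypothetical duck-typed reader exposing read() and
# __iter__ should be accepted (the C engine reads from the buffer; the
# Python engine can iterate lines), e.g. pd.read_csv(_MinimalReader("a\n1")).
class _MinimalReader:
    def __init__(self, text):
        self._buf = StringIO(text)

    def read(self, size=-1):
        return self._buf.read(size)

    def __iter__(self):
        return iter(self._buf)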
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_read_csv_file_handle(all_parsers, io_class, encoding):
"""
Test whether read_csv does not close user-provided file handles.
GH 36980
"""
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
content = "a,b\n1,2"
handle = io_class(content.encode("utf-8") if io_class == BytesIO else content)
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
assert not handle.closed
def test_memory_map_compression(all_parsers, compression):
"""
Support memory map for compressed files.
GH 37621
"""
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
with tm.ensure_clean() as path:
expected.to_csv(path, index=False, compression=compression)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, memory_map=True, compression=compression)
return
result = parser.read_csv(path, memory_map=True, compression=compression)
tm.assert_frame_equal(
result,
expected,
)
def test_context_manager(all_parsers, datapath):
# make sure that opened files are closed
parser = all_parsers
path = datapath("io", "data", "csv", "iris.csv")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return
reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
with reader:
next(reader)
assert False
except AssertionError:
assert reader.handles.handle.closed
def test_context_manager_user_provided(all_parsers, datapath):
# make sure that user-provided handles are not closed
parser = all_parsers
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return
reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
with reader:
next(reader)
assert False
except AssertionError:
assert not reader.handles.handle.closed
@skip_pyarrow # ParserError: Empty CSV file
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
# GH 31488
parser = all_parsers
with tm.ensure_clean() as path:
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
parser.read_csv(path)
def test_memory_map(all_parsers, csv_dir_path):
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
parser = all_parsers
expected = DataFrame(
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(mmap_file, memory_map=True)
return
result = parser.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,79 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas.compat import is_platform_linux
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
def test_float_parser(all_parsers):
# see gh-9565
parser = all_parsers
data = "45e-1,4.5,45.,inf,-inf"
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([[float(s) for s in data.split(",")]])
tm.assert_frame_equal(result, expected)
def test_scientific_no_exponent(all_parsers_all_precisions):
# see gh-12215
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
data = df.to_csv(index=False)
parser, precision = all_parsers_all_precisions
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
tm.assert_frame_equal(df_roundtrip, df)
@pytest.mark.parametrize(
"neg_exp",
[
-617,
-100000,
pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{neg_exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
expected = DataFrame({"data": [0.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.skip_ubsan
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
if precision == "round_trip":
if exp == 999999999999999999 and is_platform_linux():
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
request.applymarker(mark)
value = np.inf if exp > 0 else 0.0
expected = DataFrame({"data": [value]})
else:
expected = DataFrame({"data": [f"10E{exp}"]})
tm.assert_frame_equal(result, expected)
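
# Sketch (not part of the suite): the C engine accepts float_precision of
# "high" (the default converter), "legacy", or "round_trip", which defers
# to Python's round-trippable float parsing.
def _sketch_float_precision():
    import pandas as pd

    data = "x\n0.123456789012345678\n"
    hi = pd.read_csv(StringIO(data), float_precision="high")
    rt = pd.read_csv(StringIO(data), float_precision="round_trip")
    return hi, rt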


@@ -0,0 +1,302 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"""foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
{"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
columns=["A", "B", "C", "D"],
),
),
(
"""foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
{"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
index=MultiIndex.from_tuples(
[
("foo", "one"),
("foo", "two"),
("foo", "three"),
("bar", "one"),
("bar", "two"),
],
names=["index1", "index2"],
),
columns=["A", "B", "C", "D"],
),
),
],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
headless_data = "\n".join(data.split("\n")[1:])
names = ["A", "B", "C", "D"]
parser = all_parsers
result = parser.read_csv(
StringIO(headless_data), index_col=index_col, header=None, names=names
)
expected = parser.read_csv(StringIO(data), index_col=index_col)
# No index names in headless data.
expected.index.names = [None] * 2
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
parser = all_parsers
data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=MultiIndex.from_tuples(
[
("foo", "one"),
("foo", "two"),
("foo", "three"),
("bar", "one"),
("bar", "two"),
]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"data,expected,header",
[
("a,b", DataFrame(columns=["a", "b"]), [0]),
(
"a,b\nc,d",
DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
[0, 1],
),
],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
# see gh-14545
parser = all_parsers
data = expected.to_csv(index=False) if round_trip else data
result = parser.read_csv(StringIO(data), header=header)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
parser = all_parsers
data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
result = parser.read_csv(StringIO(data), sep=" ")
expected = DataFrame(
[[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
)
tm.assert_frame_equal(result, expected)
def test_read_duplicate_index_explicit(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=0)
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
parser = all_parsers
csv2 = os.path.join(csv_dir_path, "test2.csv")
result = parser.read_csv(csv2, index_col=0, parse_dates=True)
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
[1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
[0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
[1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
[-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
],
columns=["A", "B", "C", "D", "E"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
]
),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_empty_with_index(all_parsers):
# see gh-10184
data = "x,y"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=0)
expected = DataFrame(columns=["y"], index=Index([], name="x"))
tm.assert_frame_equal(result, expected)
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
# see gh-10467
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=["x", "y"])
expected = DataFrame(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
)
tm.assert_frame_equal(result, expected)
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=[1, 0])
expected = DataFrame(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,78 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
option_context,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
parser = all_parsers
data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
expected = DataFrame(
{"A": [float("inf"), float("-inf")] * 5},
index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
)
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
parser = all_parsers
data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
expected = DataFrame(
{"A": [float("infinity"), float("-infinity"), float("+infinity")]},
index=["a", "b", "c"],
)
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
tm.assert_frame_equal(result, expected)
def test_read_csv_with_use_inf_as_na(all_parsers):
# https://github.com/pandas-dev/pandas/issues/35493
parser = all_parsers
data = "1.0\nNaN\n3.0"
msg = "use_inf_as_na option is deprecated"
warn = FutureWarning
if parser.engine == "pyarrow":
warn = (FutureWarning, DeprecationWarning)
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
with option_context("use_inf_as_na", True):
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([1.0, np.nan, 3.0])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,231 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_int_conversion(all_parsers):
data = """A,B
1.0,1
2.0,2
3.0,3
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"A,B\nTrue,1\nFalse,2\nTrue,3",
{},
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
),
(
"A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
{"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
DataFrame(
[[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
columns=["A", "B"],
),
),
(
"A,B\nTRUE,1\nFALSE,2\nTRUE,3",
{},
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
),
(
"A,B\nfoo,bar\nbar,foo",
{"true_values": ["foo"], "false_values": ["bar"]},
DataFrame([[True, False], [False, True]], columns=["A", "B"]),
),
],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_parse_integers_above_fp_precision(all_parsers):
data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{
"Numbers": [
17007000002000191,
17007000002000191,
17007000002000191,
17007000002000191,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000194,
]
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
# see gh-2601
data = "65248E10 11\n55555E55 22\n"
parser = all_parsers
if parser.engine == "pyarrow" and sep != " ":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=None, sep=sep)
return
result = parser.read_csv(StringIO(data), header=None, sep=sep)
expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
tm.assert_frame_equal(result, expected)
def test_int64_min_issues(all_parsers):
# see gh-2599
parser = all_parsers
data = "A,B\n0,0\n0,"
result = parser.read_csv(StringIO(data))
expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv, request):
data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
parser = all_parsers
if conv is None:
# 13007854817840016671868 > UINT64_MAX, so this
# will overflow and return object as the dtype.
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
"00013007854817840016671868",
"00013007854817840016749251",
"00013007854817840016754630",
"00013007854817840016781876",
"00013007854817840017028824",
"00013007854817840017963235",
"00013007854817840018860166",
],
columns=["ID"],
)
tm.assert_frame_equal(result, expected)
else:
# 13007854817840016671868 > UINT64_MAX, so attempts
# to cast to either int64 or uint64 will result in
# an OverflowError being raised.
msg = "|".join(
[
"Python int too large to convert to C long",
"long too big to convert",
"int too big to convert",
]
)
err = OverflowError
if parser.engine == "pyarrow":
err = ValueError
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), converters={"ID": conv})
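# Worked boundary check (an illustrative sketch, not part of the suite):
# uint64 tops out at twenty decimal digits, while the IDs above carry
# twenty-three significant digits, hence the overflow paths exercised in
# test_int64_overflow.
def _demo_uint64_boundary():
    assert np.iinfo(np.uint64).max == 2**64 - 1 == 18446744073709551615
    assert 13007854817840016671868 > np.iinfo(np.uint64).max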
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right at the int64-uint64 boundaries,
    # so they can still be parsed as integers rather than strings.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)
expected = DataFrame([val])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
# These numbers fall just outside the int64-uint64
# range, so they should be parsed as string.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)
expected = DataFrame([str(val)])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
# No numerical dtype can hold both negative and uint64
# values, so they should be cast as string.
parser = all_parsers
data = "\n".join(exp_data)
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), header=None)
tm.assert_frame_equal(result, expected)
def test_integer_precision(all_parsers):
# Gh 7072
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
parser = all_parsers
result = parser.read_csv(StringIO(s), header=None)[4]
expected = Series([4321583677327450765, 4321113141090630389], name=4)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,134 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
def test_iterator(all_parsers):
# see gh-6607
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
expected = parser.read_csv(StringIO(data), **kwargs)
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), iterator=True, **kwargs)
return
with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
first_chunk = reader.read(3)
tm.assert_frame_equal(first_chunk, expected[:3])
last_chunk = reader.read(5)
tm.assert_frame_equal(last_chunk, expected[3:])
def test_iterator2(all_parsers):
parser = all_parsers
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), iterator=True)
return
with parser.read_csv(StringIO(data), iterator=True) as reader:
result = list(reader)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result[0], expected)
def test_iterator_stop_on_chunksize(all_parsers):
# gh-3967: stopping iteration when chunksize is specified
parser = all_parsers
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=1)
return
with parser.read_csv(StringIO(data), chunksize=1) as reader:
result = list(reader)
assert len(result) == 3
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(concat(result), expected)
@pytest.mark.parametrize(
"kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
msg = "'skipfooter' not supported for iteration"
parser = all_parsers
data = "a\n1\n2"
if parser.engine == "pyarrow":
msg = (
"The '(chunksize|iterator)' option is not supported with the "
"'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
pass
def test_iteration_open_handle(all_parsers):
parser = all_parsers
kwargs = {"header": None}
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
with open(path, encoding="utf-8") as f:
for line in f:
if "CCC" in line:
break
result = parser.read_csv(f, **kwargs)
expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
tm.assert_frame_equal(result, expected)
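# Note on the test above: read_csv consumes the handle from its current
# position, so the rows read before the break ("AAA" through "CCC") never
# reach the parser and the resulting frame starts at "DDD".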

View File

@ -0,0 +1,320 @@
"""
Tests that work on the Python, C and PyArrow engines but do not have a
specific classification into the other test modules.
"""
import codecs
import csv
from io import StringIO
import os
from pathlib import Path
import numpy as np
import pytest
from pandas.compat import PY311
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas import DataFrame
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_empty_decimal_marker(all_parsers):
data = """A|B|C
1|2,334|5
10|13|10.
"""
# Parsers support only length-1 decimals
msg = "Only length-1 decimal markers supported"
parser = all_parsers
if parser.engine == "pyarrow":
msg = (
"only single character unicode strings can be "
"converted to Py_UCS4, got length 0"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), decimal="")
def test_bad_stream_exception(all_parsers, csv_dir_path):
# see gh-13652
#
# This test validates that both the Python engine and C engine will
# raise UnicodeDecodeError instead of C engine raising ParserError
# and swallowing the exception that caused read to fail.
path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
codec = codecs.lookup("utf-8")
utf8 = codecs.lookup("utf-8")
parser = all_parsers
msg = "'utf-8' codec can't decode byte"
# Stream must be binary UTF8.
with open(path, "rb") as handle, codecs.StreamRecoder(
handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
) as stream:
with pytest.raises(UnicodeDecodeError, match=msg):
parser.read_csv(stream)
def test_malformed(all_parsers):
# see gh-6607
parser = all_parsers
data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
msg = "Expected 3 fields in line 4, saw 5"
err = ParserError
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
err = ValueError
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#")
@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=1,
comment="#",
iterator=True,
chunksize=1,
skiprows=[2],
)
return
msg = "Expected 3 fields in line 6, saw 5"
with parser.read_csv(
StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
) as reader:
with pytest.raises(ParserError, match=msg):
reader.read(nrows)
@xfail_pyarrow # does not raise
def test_catch_too_many_names(all_parsers):
# see gh-5156
data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
parser = all_parsers
msg = (
"Too many columns specified: expected 4 and found 3"
if parser.engine == "c"
else "Number of passed names did not match "
"number of header fields in the file"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
parser = all_parsers
data = "\n" * nrows
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data))
def test_unexpected_keyword_parameter_exception(all_parsers):
# GH-34976
parser = all_parsers
msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
with pytest.raises(TypeError, match=msg.format("read_csv")):
parser.read_csv("foo.csv", foo=1)
with pytest.raises(TypeError, match=msg.format("read_table")):
parser.read_table("foo.tsv", foo=1)
def test_suppress_error_output(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
tm.assert_frame_equal(result, expected)
def test_error_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
msg = "Expected 1 fields in line 3, saw 3"
if parser.engine == "pyarrow":
# "CSV parse error: Expected 1 columns, got 3: 1,2,3"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), on_bad_lines="error")
def test_warn_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})
match_msg = "Skipping line"
expected_warning = ParserWarning
if parser.engine == "pyarrow":
match_msg = "Expected 1 columns, but found 3: 1,2,3"
expected_warning = (ParserWarning, DeprecationWarning)
with tm.assert_produces_warning(
expected_warning, match=match_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)
def test_read_csv_wrong_num_columns(all_parsers):
    # The header declares 6 columns, but line 3 has 7 fields.
data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
parser = all_parsers
msg = "Expected 6 fields in line 3, saw 7"
if parser.engine == "pyarrow":
# Expected 6 columns, got 7: 6,7,8,9,10,11,12
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))
def test_null_byte_char(request, all_parsers):
# see gh-2741
data = "\x00,foo"
names = ["a", "b"]
parser = all_parsers
if parser.engine == "c" or (parser.engine == "python" and PY311):
if parser.engine == "python" and PY311:
request.applymarker(
pytest.mark.xfail(
reason="In Python 3.11, this is read as an empty character not null"
)
)
expected = DataFrame([[np.nan, "foo"]], columns=names)
out = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(out, expected)
else:
if parser.engine == "pyarrow":
# CSV parse error: Empty CSV file or block: "
# cannot infer number of columns"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
else:
msg = "NULL byte detected"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), names=names)
@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
# GH 39024
parser = all_parsers
msg = "Could not determine delimiter"
err = csv.Error
if parser.engine == "c":
msg = "the 'c' engine does not support sep=None with delim_whitespace=False"
err = ValueError
elif parser.engine == "pyarrow":
msg = (
"the 'pyarrow' engine does not support sep=None with delim_whitespace=False"
)
err = ValueError
with tm.ensure_clean() as path:
file = Path(path)
file.write_bytes(b"\xe4\na\n1")
with tm.assert_produces_warning(None):
# should not trigger a ResourceWarning
with pytest.raises(err, match=msg):
parser.read_csv(file, sep=None, encoding_errors="replace")
def test_invalid_on_bad_line(all_parsers):
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"):
parser.read_csv(StringIO(data), on_bad_lines="abc")
def test_bad_header_uniform_error(all_parsers):
parser = all_parsers
data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
msg = "Expected 2 fields in line 2, saw 4"
if parser.engine == "c":
msg = (
"Could not construct index. Requested to use 1 "
"number of columns, but 3 left to parse."
)
elif parser.engine == "pyarrow":
# "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
def test_on_bad_lines_warn_correct_formatting(all_parsers):
# see gh-15925
parser = all_parsers
data = """1,2
a,b
a,b,c
a,b,d
a,b
"""
expected = DataFrame({"1": "a", "2": ["b"] * 2})
match_msg = "Skipping line"
expected_warning = ParserWarning
if parser.engine == "pyarrow":
match_msg = "Expected 2 columns, but found 3: a,b,c"
expected_warning = (ParserWarning, DeprecationWarning)
with tm.assert_produces_warning(
expected_warning, match=match_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)
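# Beyond the "error"/"warn"/"skip" modes exercised above, on_bad_lines also
# accepts a callable with the python engine (an illustrative sketch, not
# part of the suite): the callable receives the offending row as a list of
# strings and returns a repaired row, or None to drop it.
def _demo_on_bad_lines_callable():
    import pandas as pd

    data = "a,b\n1,2\n1,2,3\n4,5"
    parsed = pd.read_csv(
        StringIO(data), engine="python", on_bad_lines=lambda row: row[:2]
    )
    tm.assert_frame_equal(parsed, DataFrame({"a": [1, 1, 4], "b": [2, 2, 5]}))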

View File

@ -0,0 +1,81 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
import pandas._testing as tm
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
def test_verbose_read(all_parsers, capsys):
parser = all_parsers
data = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True)
return
# Engines are verbose in different ways.
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True)
captured = capsys.readouterr()
if parser.engine == "c":
assert "Tokenization took:" in captured.out
assert "Parser memory cleanup took:" in captured.out
else: # Python engine
assert captured.out == "Filled 3 NA values in column a\n"
def test_verbose_read2(all_parsers, capsys):
parser = all_parsers
data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True, index_col=0)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True, index_col=0)
captured = capsys.readouterr()
# Engines are verbose in different ways.
if parser.engine == "c":
assert "Tokenization took:" in captured.out
assert "Parser memory cleanup took:" in captured.out
else: # Python engine
assert captured.out == "Filled 1 NA values in column a\n"

View File

@ -0,0 +1,319 @@
from __future__ import annotations
import os
import pytest
from pandas.compat._optional import VERSIONS
from pandas import (
read_csv,
read_table,
)
import pandas._testing as tm
class BaseParser:
engine: str | None = None
low_memory = True
float_precision_choices: list[str | None] = []
def update_kwargs(self, kwargs):
kwargs = kwargs.copy()
kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
return kwargs
def read_csv(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_csv(*args, **kwargs)
def read_csv_check_warnings(
self,
warn_type: type[Warning],
warn_msg: str,
*args,
raise_on_extra_warnings=True,
check_stacklevel: bool = True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type,
match=warn_msg,
raise_on_extra_warnings=raise_on_extra_warnings,
check_stacklevel=check_stacklevel,
):
return read_csv(*args, **kwargs)
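    # Typical call site (usage sketch): tests pass the expected warning class
    # and message ahead of the usual read_csv arguments, e.g.
    #   parser.read_csv_check_warnings(
    #       ParserWarning, "Both a converter and dtype were specified ...",
    #       StringIO(data), dtype={"a": "i8"}, converters={"a": str},
    #   )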
def read_table(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_table(*args, **kwargs)
def read_table_check_warnings(
self,
warn_type: type[Warning],
warn_msg: str,
*args,
raise_on_extra_warnings=True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_table is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
):
return read_table(*args, **kwargs)
class CParser(BaseParser):
engine = "c"
float_precision_choices = [None, "high", "round_trip"]
class CParserHighMemory(CParser):
low_memory = False
class CParserLowMemory(CParser):
low_memory = True
class PythonParser(BaseParser):
engine = "python"
float_precision_choices = [None]
class PyArrowParser(BaseParser):
engine = "pyarrow"
float_precision_choices = [None]
@pytest.fixture
def csv_dir_path(datapath):
"""
The directory path to the data files needed for parser tests.
"""
return datapath("io", "parser", "data")
@pytest.fixture
def csv1(datapath):
"""
The path to the data file "test1.csv" needed for parser tests.
"""
return os.path.join(datapath("io", "data", "csv"), "test1.csv")
_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]
_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]
_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_pyarrow_parsers_ids = ["pyarrow"]
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
"""
    Fixture for all of the CSV parsers.
"""
parser = request.param()
if parser.engine == "pyarrow":
pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try finding a way to disable threads altogether
        # for more stable CI runs
import pyarrow
pyarrow.set_cpu_count(1)
return parser
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
"""
    Fixture for all of the CSV parsers using the C engine.
"""
return request.param()
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
"""
    Fixture for all of the CSV parsers using the Python engine.
"""
return request.param()
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
"""
    Fixture for all of the CSV parsers using the PyArrow engine.
"""
return request.param()
def _get_all_parser_float_precision_combinations():
"""
Return all allowable parser and float precision
combinations and corresponding ids.
"""
params = []
ids = []
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
if hasattr(parser, "values"):
# Wrapped in pytest.param, get the actual parser back
parser = parser.values[0]
for precision in parser.float_precision_choices:
# Re-wrap in pytest.param for pyarrow
mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else ()
param = pytest.param((parser(), precision), marks=mark)
params.append(param)
ids.append(f"{parser_id}-{precision}")
return {"params": params, "ids": ids}
@pytest.fixture(
params=_get_all_parser_float_precision_combinations()["params"],
ids=_get_all_parser_float_precision_combinations()["ids"],
)
def all_parsers_all_precisions(request):
"""
Fixture for all allowable combinations of parser
and float precision
"""
return request.param
_utf_values = [8, 16, 32]
_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]
_encoding_fmts = [
f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
]
@pytest.fixture(params=_utf_values)
def utf_value(request):
"""
Fixture for all possible integer values for a UTF encoding.
"""
return request.param
@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
"""
Fixture for all possible string formats of a UTF encoding.
"""
return request.param
@pytest.fixture(
params=[
("-1,0", -1.0),
("-1,2e0", -1.2),
("-1e0", -1.0),
("+1e0", 1.0),
("+1e+0", 1.0),
("+1e-1", 0.1),
("+,1e1", 1.0),
("+1,e0", 1.0),
("-,1e1", -1.0),
("-1,e0", -1.0),
("0,1", 0.1),
("1,", 1.0),
(",1", 0.1),
("-,1", -0.1),
("1_,", 1.0),
("1_234,56", 1234.56),
("1_234,56e0", 1234.56),
# negative cases; must not parse as float
("_", "_"),
("-_", "-_"),
("-_1", "-_1"),
("-_1e0", "-_1e0"),
("_1", "_1"),
("_1,", "_1,"),
("_1,_", "_1,_"),
("_1e0", "_1e0"),
("1,2e_1", "1,2e_1"),
("1,2e1_0", "1,2e1_0"),
("1,_2", "1,_2"),
(",1__2", ",1__2"),
(",1e", ",1e"),
("-,1e", "-,1e"),
("1_000,000_000", "1_000,000_000"),
("1,e1_2", "1,e1_2"),
("e11,2", "e11,2"),
("1e11,2", "1e11,2"),
("1,2,2", "1,2,2"),
("1,2_1", "1,2_1"),
("1,2e-10e1", "1,2e-10e1"),
("--1,2", "--1,2"),
("1a_2,1", "1a_2,1"),
("1,2E-1", 0.12),
("1,2E1", 12.0),
]
)
def numeric_decimal(request):
"""
Fixture for all numeric formats which should get recognized. The first entry
represents the value to read while the second represents the expected result.
"""
return request.param
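# Usage sketch: consumers unpack the fixture as a (text, expected) pair,
# e.g. ``text, value = numeric_decimal``; when the expected entry is a str,
# the input must *not* parse as a float (see decimal_number_check in the
# dtype tests for a full example).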
@pytest.fixture
def pyarrow_xfail(request):
"""
Fixture that xfails a test if the engine is pyarrow.
    Use if the failure is due to unsupported keywords or inconsistent results.
"""
if "all_parsers" in request.fixturenames:
parser = request.getfixturevalue("all_parsers")
elif "all_parsers_all_precisions" in request.fixturenames:
# Return value is tuple of (engine, precision)
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
else:
return
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)
@pytest.fixture
def pyarrow_skip(request):
"""
Fixture that skips a test if the engine is pyarrow.
    Use if the failure is due to a parsing failure from pyarrow.csv.read_csv.
"""
if "all_parsers" in request.fixturenames:
parser = request.getfixturevalue("all_parsers")
elif "all_parsers_all_precisions" in request.fixturenames:
# Return value is tuple of (engine, precision)
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
else:
return
if parser.engine == "pyarrow":
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
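# Usage sketch: test modules typically apply these two fixtures indirectly,
# via module-level markers rather than by requesting them as arguments:
#
#   xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
#   skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
#
#   @skip_pyarrow  # e.g. "CSV parse error: Empty CSV file or block"
#   def test_something(all_parsers):
#       ...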

View File

@ -0,0 +1,334 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Timestamp,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize(
"dtype",
[
"category",
CategoricalDtype(),
{"a": "category", "b": "category", "c": CategoricalDtype()},
],
)
def test_categorical_dtype(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["a", "a", "b"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype, request):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(
strict=False,
reason="Flaky test sometimes gives object dtype instead of Categorical",
)
request.applymarker(mark)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_unsorted(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", "b", "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_missing(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", np.nan, "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
# see gh-18186
# was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
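    # (lowering the heuristic below forces the C parser to tokenize even this
    # small input in multiple internal chunks, the condition under which the
    # original bug appeared)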
parser = all_parsers
heuristic = 2**5
data = np.sort([str(i) for i in range(heuristic + 1)])
expected = DataFrame({"a": Categorical(data, ordered=True)})
with monkeypatch.context() as m:
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
actual["a"] = actual["a"].cat.reorder_categories(
np.sort(actual.a.cat.categories), ordered=True
)
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
encoding = "utf-16"
sep = "\t"
expected = parser.read_csv(pth, sep=sep, encoding=encoding)
expected = expected.apply(Categorical)
actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
return
with parser.read_csv(
StringIO(data), dtype={"b": "category"}, chunksize=2
) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ["a", "b", "c"]
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
DataFrame(
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
index=[2, 3],
),
]
dtype = CategoricalDtype(cats)
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
return
with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
encoding = "latin-1"
expected = parser.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])
actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
"categories",
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(
["a", "b", "b", "c"], categories=categories, ordered=ordered
),
}
)
dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_category_dtype_unsorted(all_parsers):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(["c", "b", "a"])
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
}
)
result = parser.read_csv(StringIO(data), dtype={"b": dtype})
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_numeric(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([1, 2, 3])}
data = "b\n1\n1\n2\n3"
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
dtype = {"b": CategoricalDtype(dti)}
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timestamp(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([Timestamp("2014")])}
data = "b\n2014-01-01\n2014-01-01"
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timedelta(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))}
data = "b\n1h\n2h\n3h"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
"b\nTrue\nFalse\nNA\nFalse",
"b\ntrue\nfalse\nNA\nfalse",
"b\nTRUE\nFALSE\nNA\nFALSE",
"b\nTrue\nFalse\nNA\nFALSE",
],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
# see gh-20498
parser = all_parsers
dtype = {"b": CategoricalDtype([False, True])}
expected = DataFrame({"b": Categorical([True, False, None, False])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
data = "b\nd\na\nc\nd" # Unexpected c
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
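# A minimal sketch (not part of the suite) of the coercion rule the test
# above relies on: values outside the declared categories are not an error;
# Categorical turns them into NaN.
def _demo_unexpected_category():
    cat = Categorical(["d", "a", "c", "d"], categories=["a", "b", "d", "e"])
    assert cat.isna().tolist() == [False, False, True, False]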

View File

@ -0,0 +1,643 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserWarning
import pandas as pd
from pandas import (
DataFrame,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
IntegerArray,
StringArray,
)
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(all_parsers, dtype, check_orig):
# see gh-3795, gh-6607
parser = all_parsers
df = DataFrame(
np.random.default_rng(2).random((5, 2)).round(4),
columns=list("AB"),
index=["1A", "1B", "1C", "1D", "1E"],
)
with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
df.to_csv(path)
result = parser.read_csv(path, dtype=dtype, index_col=0)
if check_orig:
expected = df.copy()
result = result.astype(float)
else:
expected = df.astype(str)
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
expected = DataFrame(
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
def test_invalid_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"):
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
# see gh-2631
parser = all_parsers
data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""
if parser.engine == "c":
msg = "Integer column has NA values"
elif parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
else:
msg = "Unable to convert column DOY"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
def test_dtype_with_converters(all_parsers):
parser = all_parsers
data = """a,b
1.1,2.2
1.2,2.3"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
)
return
    # Dtype spec is ignored if a converter is specified.
result = parser.read_csv_check_warnings(
ParserWarning,
"Both a converter and dtype were specified for column a "
"- only the converter will be used.",
StringIO(data),
dtype={"a": "i8"},
converters={"a": lambda x: str(x)},
)
expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
data = "0\n1"
parser = all_parsers
expected = DataFrame([0, 1], dtype=dtype)
result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
tm.assert_frame_equal(expected, result)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_boolean_dtype(all_parsers):
parser = all_parsers
data = "\n".join(
[
"a",
"True",
"TRUE",
"true",
"1",
"1.0",
"False",
"FALSE",
"false",
"0",
"0.0",
"NaN",
"nan",
"NA",
"null",
"NULL",
]
)
result = parser.read_csv(StringIO(data), dtype="boolean")
expected = DataFrame(
{
"a": pd.array(
[
True,
True,
True,
True,
True,
False,
False,
False,
False,
False,
None,
None,
None,
None,
None,
],
dtype="boolean",
)
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
# GH#35873
result = all_parsers.read_csv(
StringIO('"dump","-9,1","-9,1",20101010'),
engine="python",
names=["col", "col1", "col2", "col3"],
usecols=["col1", "col2", "col3"],
parse_dates=["col3"],
decimal=",",
)
expected = DataFrame(
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(
request, python_parser_only, numeric_decimal, thousands
):
# GH#31920
decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None)
@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
request, c_parser_only, numeric_decimal, float_precision, thousands
):
    # Test decimal and thousands-separator handling across the
    # 'float_precision' parser settings.
decimal_number_check(
request, c_parser_only, numeric_decimal, thousands, float_precision
)
text, value = numeric_decimal
text = " " + text + " "
if isinstance(value, str): # the negative cases (parse as text)
value = " " + value + " "
decimal_number_check(
request, c_parser_only, (text, value), thousands, float_precision
)
def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
# GH#31920
value = numeric_decimal[0]
if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
request.applymarker(
pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
)
df = parser.read_csv(
StringIO(value),
float_precision=float_precision,
sep="|",
thousands=thousands,
decimal=",",
header=None,
)
val = df.iloc[0, 0]
assert val == numeric_decimal[1]
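# A minimal standalone sketch (not collected by pytest) of the behavior the
# helper above verifies: with "," as the decimal marker and "_" as the
# thousands separator, "1_234,56" parses to the float 1234.56.
def _demo_decimal_thousands():
    df = pd.read_csv(StringIO("1_234,56"), decimal=",", thousands="_", header=None)
    assert df.iloc[0, 0] == 1234.56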
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
DATA = """id\tnum\t
1\t1.2 \t
1\t 2.1\t
2\t 1\t
2\t 1.2 \t
"""
df = c_parser_only.read_csv(
StringIO(DATA),
float_precision=float_precision,
sep="\t",
header=0,
dtype={1: np.float64},
)
tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
@pytest.mark.usefixtures("pyarrow_xfail")
def test_true_values_cast_to_bool(all_parsers):
# GH#34655
text = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
parser = all_parsers
result = parser.read_csv(
StringIO(text),
true_values=["yes"],
false_values=["no"],
dtype={"a": "boolean"},
)
expected = DataFrame(
{"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
)
expected["a"] = expected["a"].astype("boolean")
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
# GH#35211
parser = all_parsers
data = """a,a\n1,1"""
dtype_dict = {"a": str, **dtypes}
# GH#42462
dtype_dict_copy = dtype_dict.copy()
result = parser.read_csv(StringIO(data), dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed"
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
# GH#42022
parser = all_parsers
data = """a,a\n1,1"""
result = parser.read_csv(StringIO(data), dtype=str)
expected = DataFrame({"a": ["1"], "a.1": ["1"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,3"
result = parser.read_csv(
StringIO(data),
header=list(range(2)),
dtype={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
expected = DataFrame(
{
("A", "X"): np.int32([1]),
("B", "Y"): np.int32([2]),
("B", "Z"): np.float32([3]),
}
)
tm.assert_frame_equal(result, expected)
def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
# GH 25472
parser = all_parsers
dtype = any_int_ea_dtype
data = """a,b,c
,3,5
1,,6
2,4,"""
expected = DataFrame(
{
"a": pd.array([pd.NA, 1, 2], dtype=dtype),
"b": pd.array([3, pd.NA, 4], dtype=dtype),
"c": pd.array([5, 6, pd.NA], dtype=dtype),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
# GH#41574
data = """a,b
1,2
"""
dtype = defaultdict(lambda: default, a="int64")
parser = all_parsers
result = parser.read_csv(StringIO(data), dtype=dtype)
expected = DataFrame({"a": [1], "b": 2.0})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
# GH#41574
data = """a,b,a,b,b.1
1,2,3,4,5
"""
dtype = defaultdict(lambda: "float64", a="int64")
dtype["b.1"] = "int64"
parser = all_parsers
result = parser.read_csv(StringIO(data), dtype=dtype)
expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
# GH#41574
data = """a,b
1,2
"""
dtype = defaultdict(lambda: "invalid_dtype", a="int64")
parser = all_parsers
with pytest.raises(TypeError, match="not understood"):
parser.read_csv(StringIO(data), dtype=dtype)
def test_dtype_backend(all_parsers):
# GH#36712
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
result = parser.read_csv(
StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]
)
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="Int64"),
"b": pd.Series([2.5, 4.5], dtype="Float64"),
"c": pd.Series([True, False], dtype="boolean"),
"d": pd.Series(["a", "b"], dtype="string"),
"e": pd.Series([pd.NA, 6], dtype="Int64"),
"f": pd.Series([pd.NA, 7.5], dtype="Float64"),
"g": pd.Series([pd.NA, True], dtype="boolean"),
"h": pd.Series([pd.NA, "a"], dtype="string"),
"i": pd.Series([Timestamp("2019-12-31")] * 2),
"j": pd.Series([pd.NA, pd.NA], dtype="Int64"),
}
)
tm.assert_frame_equal(result, expected)
def test_dtype_backend_and_dtype(all_parsers):
# GH#36712
parser = all_parsers
data = """a,b
1,2.5
,
"""
result = parser.read_csv(
StringIO(data), dtype_backend="numpy_nullable", dtype="float64"
)
expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
tm.assert_frame_equal(result, expected)
def test_dtype_backend_string(all_parsers, string_storage):
# GH#36712
pa = pytest.importorskip("pyarrow")
with pd.option_context("mode.string_storage", string_storage):
parser = all_parsers
data = """a,b
a,x
b,
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
if string_storage == "python":
expected = DataFrame(
{
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
}
)
else:
expected = DataFrame(
{
"a": ArrowStringArray(pa.array(["a", "b"])),
"b": ArrowStringArray(pa.array(["x", None])),
}
)
tm.assert_frame_equal(result, expected)
def test_dtype_backend_ea_dtype_specified(all_parsers):
# GH#491496
data = """a,b
1,2
"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), dtype="Int64", dtype_backend="numpy_nullable"
)
expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
tm.assert_frame_equal(result, expected)
def test_dtype_backend_pyarrow(all_parsers, request):
# GH#36712
pa = pytest.importorskip("pyarrow")
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"])
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="int64[pyarrow]"),
"b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
"c": pd.Series([True, False], dtype="bool[pyarrow]"),
"d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
"e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
"f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
"g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
"h": pd.Series(
[pd.NA, "a"],
dtype=pd.ArrowDtype(pa.string()),
),
"i": pd.Series([Timestamp("2019-12-31")] * 2),
"j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
}
)
tm.assert_frame_equal(result, expected)
# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
data = """a,b
1,1
,1
1582218195625938945,1
"""
result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
expected = DataFrame(
{
"a": IntegerArray(
np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
),
"b": 1,
}
)
tm.assert_frame_equal(result, expected)
def test_string_inference(all_parsers):
# GH#54430
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
data = """a,b
x,1
y,2
,3"""
parser = all_parsers
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
columns=pd.Index(["a", "b"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
def test_string_inference_object_dtype(all_parsers, dtype):
# GH#56047
pytest.importorskip("pyarrow")
data = """a,b
x,a
y,a
z,a"""
parser = all_parsers
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data), dtype=dtype)
expected = DataFrame(
{
"a": pd.Series(["x", "y", "z"], dtype=object),
"b": pd.Series(["a", "a", "a"], dtype=object),
},
columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data), dtype={"a": dtype})
expected = DataFrame(
{
"a": pd.Series(["x", "y", "z"], dtype=object),
"b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
},
columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(result, expected)
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
AAPL,20230301181139587,1925036343869802844,
AAPL,20230301181139587,2023552585717889863,2023552585717263358
NVDA,20230301181139587,2023552585717889863,2023552585717263359
AMC,20230301181139587,2023552585717889863,2023552585717263360
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
    orders = all_parsers.read_csv(
        StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}
    )
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2
def test_dtypes_with_usecols(all_parsers):
# GH#54868
parser = all_parsers
data = """a,b,c
1,2,3
4,5,6"""
result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object})
if parser.engine == "pyarrow":
values = [1, 4]
else:
values = ["1", "4"]
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,181 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_dtype_all_columns_empty(all_parsers):
# see gh-12048
parser = all_parsers
result = parser.read_csv(StringIO("A,B"), dtype=str)
expected = DataFrame({"A": [], "B": []}, dtype=str)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(StringIO(data), dtype={"one": "u1"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
)
expected = DataFrame(
{"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_multi_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two,three"
result = parser.read_csv(
StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
)
exp_idx = MultiIndex.from_arrays(
[np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)],
names=["one", "two"],
)
expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
expected.index = expected.index.astype(object)
with pytest.raises(ValueError, match="Duplicate names"):
data = ""
parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
@pytest.mark.parametrize(
"dtype,expected",
[
(np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
(
"category",
DataFrame({"a": Categorical([]), "b": Categorical([])}),
),
(
{"a": "category", "b": "category"},
DataFrame({"a": Categorical([]), "b": Categorical([])}),
),
("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
(
"timedelta64[ns]",
DataFrame(
{
"a": Series([], dtype="timedelta64[ns]"),
"b": Series([], dtype="timedelta64[ns]"),
},
),
),
(
{"a": np.int64, "b": np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
(
{0: np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
(
{"a": np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
],
)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_dtype(all_parsers, dtype, expected):
# see gh-14712
parser = all_parsers
data = "a,b"
result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,643 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the Python parser can
accept further arguments when parsing.
"""
from decimal import Decimal
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
import mmap
import os
import tarfile
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
ParserError,
ParserWarning,
)
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"malformed",
["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
# see gh-9205: test certain malformed input files that cause
# buffer overflows in tokenizer.c
msg = "Buffer overflow caught - possible malformed input file."
parser = c_parser_only
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(malformed))
def test_delim_whitespace_custom_terminator(c_parser_only):
# See gh-12912
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)
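# `delim_whitespace=True` is deprecated; `sep=r"\s+"` is the documented
# replacement (and is special-cased so the C engine can still be used).
# A minimal illustrative sketch with assumed data, not part of the suite:
def _example_whitespace_sep():
    import pandas as pd

    data = "a b c\n1 2 3\n4 5 6\n7 8 9"
    df = pd.read_csv(StringIO(data), sep=r"\s+")
    assert df.shape == (3, 3)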
def test_dtype_and_names_error(c_parser_only):
# see gh-8833: passing both dtype and names
# resulting in an error reporting issue
parser = c_parser_only
data = """
1.0 1
2.0 2
3.0 3
"""
# base cases
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# fallback casting
result = parser.read_csv(
StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
)
expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
expected["a"] = expected["a"].astype(np.int32)
tm.assert_frame_equal(result, expected)
data = """
1.0 1
nan 2
3.0 3
"""
# fallback casting, but not castable
warning = RuntimeWarning if np_version_gte1p24 else None
with pytest.raises(ValueError, match="cannot safely convert"):
with tm.assert_produces_warning(warning, check_stacklevel=False):
parser.read_csv(
StringIO(data),
sep=r"\s+",
header=None,
names=["a", "b"],
dtype={"a": np.int32},
)
@pytest.mark.parametrize(
"match,kwargs",
[
# For each of these cases, all of the dtypes are valid, just unsupported.
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
{"dtype": {"A": "datetime64", "B": "float64"}},
),
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
{"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
),
(
"the dtype timedelta64 is not supported for parsing",
{"dtype": {"A": "timedelta64", "B": "float64"}},
),
(
f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
{"dtype": {"A": "U8"}},
),
],
ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
parser = c_parser_only
df = DataFrame(
np.random.default_rng(2).random((5, 2)),
columns=list("AB"),
index=["1A", "1B", "1C", "1D", "1E"],
)
with tm.ensure_clean("__unsupported_dtype__.csv") as path:
df.to_csv(path)
with pytest.raises(TypeError, match=match):
parser.read_csv(path, index_col=0, **kwargs)
@td.skip_if_32bit
@pytest.mark.slow
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
parser = c_parser_only
normal_errors = []
precise_errors = []
def error(val: float, actual_val: Decimal) -> Decimal:
return abs(Decimal(f"{val:.100}") - actual_val)
# 25 decimal digits of precision
text = f"a\n{num:.25}"
normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])
normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))
# round-trip should match float()
assert roundtrip_val == float(text[2:])
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
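# "round_trip" is the only mode guaranteed to reproduce Python's own
# float() parsing exactly; "high" is a faster near-exact tokenizer and
# "legacy" the original, least precise one. A minimal sketch using an
# assumed high-precision literal (illustrative only):
def _example_float_precision_round_trip():
    import pandas as pd

    val = "0.3066101993807095471566981359501369297504425048828125"
    got = pd.read_csv(StringIO(f"a\n{val}"), float_precision="round_trip")
    # round_trip must agree exactly with Python's float().
    assert got["a"][0] == float(val)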
def test_usecols_dtypes(c_parser_only):
parser = c_parser_only
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
result = parser.read_csv(
StringIO(data),
usecols=(0, 1, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
result2 = parser.read_csv(
StringIO(data),
usecols=(0, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
def test_disable_bool_parsing(c_parser_only):
# see gh-2090
parser = c_parser_only
data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""
result = parser.read_csv(StringIO(data), dtype=object)
assert (result.dtypes == object).all()
result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
assert result["B"][2] == ""
def test_custom_lineterminator(c_parser_only):
parser = c_parser_only
data = "a,b,c~1,2,3~4,5,6"
result = parser.read_csv(StringIO(data), lineterminator="~")
expected = parser.read_csv(StringIO(data.replace("~", "\n")))
tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(c_parser_only):
parser = c_parser_only
data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""
nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
result = parser.read_csv(
StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
)
expected = parser.read_csv(
StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
)
tm.assert_frame_equal(result, expected)
    # too many columns; would cause a segfault if not handled carefully
data = "1,2\n3,4,5"
result = parser.read_csv(StringIO(data), header=None, names=range(50))
expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
columns=range(50)
)
tm.assert_frame_equal(result, expected)
def test_tokenize_CR_with_quoting(c_parser_only):
# see gh-3453
parser = c_parser_only
data = ' a,b,c\r"a,b","e,d","f,f"'
result = parser.read_csv(StringIO(data), header=None)
expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data))
expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
# See gh-12494
#
# Cause of error was that the C parser
# was not increasing the buffer size when
# the desired space would fill the buffer
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream.
    # 3 * 2^n commas were observed to break the parser
parser = c_parser_only
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)
@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
    # This test is part of a bugfix for gh-13703. It attempts to
    # stress the system memory allocator, causing it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of the parser otherwise modify the contents
    # of the memory space where it was formerly located.
# This test is designed to cause a `segfault` with unpatched
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.
# Also force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
parser = c_parser_only
# Generate a large mixed-type CSV file on-the-fly (one record is
# approx 1.5KiB).
record_ = (
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
"""ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
"""ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
"""99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
"""9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
"""99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
"""99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
"""ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
"""ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
"""ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
"""9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
"""999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
""",,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
""",9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
"""999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
""",9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
"""ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
""",999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
""",,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
"""9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
""".99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
""",,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
"""99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
"""ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
"""-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
""",9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
""",99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
""".99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
)
# Set the number of lines so that a call to `parser_trim_buffers`
# is triggered: after a couple of full chunks are consumed a
# relatively small 'residual' chunk would cause reallocation
# within the parser.
chunksize, n_lines = 128, 2 * 128 + 15
csv_data = "\n".join([record_] * n_lines) + "\n"
# We will use StringIO to load the CSV from this text buffer.
# pd.read_csv() will iterate over the file in chunks and will
# finally read a residual chunk of really small size.
# Generate the expected output: manually create the dataframe
# by splitting by comma and repeating the `n_lines` times.
row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
expected = DataFrame(
[row for _ in range(n_lines)], dtype=object, columns=None, index=None
)
# Iterate over the CSV file in chunks of `chunksize` lines
with parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding=encoding,
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)
# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)
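# The trim-buffers path above is driven by chunked reading with a small
# residual chunk at the end. The user-facing pattern, as a minimal sketch
# (sizes are arbitrary assumptions, not part of the original suite):
def _example_chunked_read_with_residual():
    import pandas as pd

    csv_data = "x\n" + "\n".join(str(i) for i in range(10))
    with pd.read_csv(StringIO(csv_data), chunksize=4) as reader:
        parts = list(reader)
    # 10 rows in chunks of 4 -> 4 + 4 + 2 (a small residual chunk).
    assert [len(p) for p in parts] == [4, 4, 2]
    assert len(concat(parts, ignore_index=True)) == 10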
def test_internal_null_byte(c_parser_only):
# see gh-14012
#
# The null byte ('\x00') should not be used as a
# true line terminator, escape character, or comment
# character, only as a placeholder to indicate that
# none was specified.
#
# This test should be moved to test_common.py ONLY when
# Python's csv class supports parsing '\x00'.
parser = c_parser_only
names = ["a", "b", "c"]
data = "1,2,3\n4,\x00,6\n7,8,9"
expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)
result = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
def test_read_nrows_large(c_parser_only):
# gh-7626 - Read only nrows of data in for large inputs (>262144b)
parser = c_parser_only
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2
df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
assert df.size == 1010 * 10
def test_float_precision_round_trip_with_text(c_parser_only):
# see gh-15140
parser = c_parser_only
df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
def test_large_difference_in_columns(c_parser_only):
# see gh-14125
parser = c_parser_only
count = 10000
large_row = ("X," * count)[:-1] + "\n"
normal_row = "XXXXXX XXXXXX,111111111111111\n"
test_input = (large_row + normal_row * 6)[:-1]
result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
rows = test_input.split("\n")
expected = DataFrame([row.split(",")[0] for row in rows])
tm.assert_frame_equal(result, expected)
def test_data_after_quote(c_parser_only):
# see gh-15910
parser = c_parser_only
data = 'a\n1\n"b"a'
result = parser.read_csv(StringIO(data))
expected = DataFrame({"a": ["1", "ba"]})
tm.assert_frame_equal(result, expected)
def test_comment_whitespace_delimited(c_parser_only):
parser = c_parser_only
test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
tm.assert_frame_equal(df, expected)
def test_file_like_no_next(c_parser_only):
# gh-16530: the file-like need not have a "next" or "__next__"
# attribute despite having an "__iter__" attribute.
#
# NOTE: This is only true for the C engine, not Python engine.
class NoNextBuffer(StringIO):
def __next__(self):
raise AttributeError("No next method")
next = __next__
parser = c_parser_only
data = "a\n1"
expected = DataFrame({"a": [1]})
result = parser.read_csv(NoNextBuffer(data))
tm.assert_frame_equal(result, expected)
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
# see gh-22748
t = BytesIO(b"\xB0")
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
msg = "'utf-8' codec can't encode character"
with pytest.raises(UnicodeError, match=msg):
c_parser_only.read_csv(t, encoding="UTF-8")
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
# see gh-16530
#
# Unfortunately, Python's CSV library can't handle
# tarfile objects (expects string, not bytes when
# iterating through a file-like).
parser = c_parser_only
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
with tarfile.open(tar_path, "r") as tar:
data_file = tar.extractfile("tar_data.csv")
out = parser.read_csv(data_file)
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(out, expected)
def test_chunk_whitespace_on_boundary(c_parser_only):
# see gh-9735: this issue is C parser-specific (bug when
# parsing whitespace and characters at chunk boundary)
#
# This test case has a field too large for the Python parser / CSV library.
parser = c_parser_only
chunk1 = "a" * (1024 * 256 - 2) + "\na"
chunk2 = "\n a"
result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
tm.assert_frame_equal(result, expected)
def test_file_handles_mmap(c_parser_only, csv1):
# gh-14418
#
# Don't close user provided file handles.
parser = c_parser_only
with open(csv1, encoding="utf-8") as f:
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
parser.read_csv(m)
assert not m.closed
def test_file_binary_mode(c_parser_only):
# see gh-23779
parser = c_parser_only
expected = DataFrame([[1, 2, 3], [4, 5, 6]])
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
f.write("1,2,3\n4,5,6")
with open(path, "rb") as f:
result = parser.read_csv(f, header=None)
tm.assert_frame_equal(result, expected)
def test_unix_style_breaks(c_parser_only):
# GH 11020
parser = c_parser_only
with tm.ensure_clean() as path:
with open(path, "w", newline="\n", encoding="utf-8") as f:
f.write("blah\n\ncol_1,col_2,col_3\n\n")
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
"data,thousands,decimal",
[
(
"""A|B|C
1|2,334.01|5
10|13|10.
""",
",",
".",
),
(
"""A|B|C
1|2.334,01|5
10|13|10,
""",
".",
",",
),
],
)
def test_1000_sep_with_decimal(
c_parser_only, data, thousands, decimal, float_precision
):
parser = c_parser_only
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
result = parser.read_csv(
StringIO(data),
sep="|",
thousands=thousands,
decimal=decimal,
float_precision=float_precision,
)
tm.assert_frame_equal(result, expected)
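# The two locale conventions only differ in which character plays which
# role, so `thousands` and `decimal` must be remapped together. A minimal
# illustrative sketch with assumed European-style data:
def _example_thousands_decimal():
    import pandas as pd

    data = "A;B\n1.234,5;6,7\n8;9,0"
    df = pd.read_csv(StringIO(data), sep=";", thousands=".", decimal=",")
    assert df["A"].tolist() == [1234.5, 8.0]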
def test_float_precision_options(c_parser_only):
# GH 17154, 36228
parser = c_parser_only
s = "foo\n243.164\n"
df = parser.read_csv(StringIO(s))
df2 = parser.read_csv(StringIO(s), float_precision="high")
tm.assert_frame_equal(df, df2)
df3 = parser.read_csv(StringIO(s), float_precision="legacy")
    assert df.iloc[0, 0] != df3.iloc[0, 0]
msg = "Unrecognized float_precision option: junk"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(s), float_precision="junk")

View File

@ -0,0 +1,227 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", na_values=na_values)
return
result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
)
def test_line_comment(all_parsers, read_kwargs, request):
parser = all_parsers
data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
warn = None
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
warn = FutureWarning
elif read_kwargs.get("lineterminator"):
data = data.replace("\n", read_kwargs.get("lineterminator"))
read_kwargs["comment"] = "#"
if parser.engine == "pyarrow":
if "lineterminator" in read_kwargs:
msg = (
"The 'lineterminator' option is not supported with the 'pyarrow' engine"
)
else:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
msg = r"Custom line terminators not supported in python parser \(yet\)"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
result = parser.read_csv(StringIO(data), **read_kwargs)
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows(all_parsers):
parser = all_parsers
data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# This should ignore the first four lines (including comments).
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", skiprows=4)
return
result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
tm.assert_frame_equal(result, expected)
def test_comment_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# Header should begin at the second non-comment line.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", header=1)
return
result = parser.read_csv(StringIO(data), comment="#", header=1)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # skiprows should skip the first four lines (including comments);
    # header=1 then selects the second remaining non-comment line
    # ("A,B,C") as the header.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
return
result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
tm.assert_frame_equal(result, expected)
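# Order of operations: skiprows removes physical lines first, comment
# stripping is applied to what is left, and only the surviving lines are
# counted for header. A minimal illustrative sketch, not part of the suite:
def _example_comment_skiprows_header_order():
    import pandas as pd

    data = "# a\njunk\nX,Y\n# b\n1,2"
    # skiprows=2 drops "# a" and "junk"; "# b" is stripped as a comment,
    # so header=0 lands on "X,Y".
    df = pd.read_csv(StringIO(data), comment="#", skiprows=2, header=0)
    assert list(df.columns) == ["X", "Y"]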
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
parser = all_parsers
data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data.replace("#", comment_char)), comment=comment_char
)
return
result = parser.read_csv(
StringIO(data.replace("#", comment_char)), comment=comment_char
)
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
# see gh-4623
parser = all_parsers
data = "# notes\na,b,c\n# more notes\n1,2,3"
if header is None:
expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
else:
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", header=header)
return
result = parser.read_csv(StringIO(data), comment="#", header=header)
tm.assert_frame_equal(result, expected)
def test_comment_char_in_default_value(all_parsers, request):
# GH#34002
if all_parsers.engine == "c":
reason = "see gh-34002: works on the python engine but not the c engine"
# NA value containing comment char is interpreted as comment
request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
parser = all_parsers
data = (
"# this is a comment\n"
"col1,col2,col3,col4\n"
"1,2,3,4#inline comment\n"
"4,5#,6,10\n"
"7,8,#N/A,11\n"
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
return
result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
expected = DataFrame(
{
"col1": [1, 4, 7],
"col2": [2, 5, 8],
"col3": [3.0, np.nan, np.nan],
"col4": [4.0, np.nan, 11.0],
}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,211 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
import os
from pathlib import Path
import tarfile
import zipfile
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture(params=[True, False])
def buffer(request):
return request.param
@pytest.fixture
def parser_and_data(all_parsers, csv1):
parser = all_parsers
with open(csv1, "rb") as f:
data = f.read()
expected = parser.read_csv(csv1)
return parser, data, expected
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("test_file.zip") as path:
with zipfile.ZipFile(path, mode="w") as tmp:
tmp.writestr("test_file", data)
if compression == "zip2":
with open(path, "rb") as f:
result = parser.read_csv(f, compression="zip")
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("combined_zip.zip") as path:
inner_file_names = ["test_file", "second_file"]
with zipfile.ZipFile(path, mode="w") as tmp:
for file_name in inner_file_names:
tmp.writestr(file_name, data)
with pytest.raises(ValueError, match="Multiple files"):
parser.read_csv(path, compression=compression)
def test_zip_error_no_files(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with zipfile.ZipFile(path, mode="w"):
pass
with pytest.raises(ValueError, match="Zero files"):
parser.read_csv(path, compression="zip")
def test_zip_error_invalid_zip(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with open(path, "rb") as f:
with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
parser.read_csv(f, compression="zip")
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
request,
parser_and_data,
compression_only,
buffer,
filename,
compression_to_extension,
):
parser, data, expected = parser_and_data
compress_type = compression_only
ext = compression_to_extension[compress_type]
filename = filename if filename is None else filename.format(ext=ext)
if filename and buffer:
request.applymarker(
pytest.mark.xfail(
reason="Cannot deduce compression from buffer of compressed data."
)
)
with tm.ensure_clean(filename=filename) as path:
tm.write_to_compressed(compress_type, path, data)
compression = "infer" if filename else compress_type
if buffer:
with open(path, "rb") as f:
result = parser.read_csv(f, compression=compression)
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
# see gh-9770
parser = all_parsers
kwargs = {"index_col": 0, "parse_dates": True}
expected = parser.read_csv(csv1, **kwargs)
kwargs["compression"] = "infer"
if buffer:
with open(csv1, encoding="utf-8") as f:
result = parser.read_csv(f, **kwargs)
else:
ext = "." + ext if ext else ""
result = parser.read_csv(csv1 + ext, **kwargs)
tm.assert_frame_equal(result, expected)
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
# see gh-18071, gh-24130
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
expected = DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
parser = all_parsers
compress_kwargs = {"compression": invalid_compression}
msg = f"Unrecognized compression type: {invalid_compression}"
with pytest.raises(ValueError, match=msg):
parser.read_csv("test_file.zip", **compress_kwargs)
def test_compression_tar_archive(all_parsers, csv_dir_path):
parser = all_parsers
path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
df = parser.read_csv(path)
assert list(df.columns) == ["a"]
def test_ignore_compression_extension(all_parsers):
parser = all_parsers
df = DataFrame({"a": [0, 1]})
with tm.ensure_clean("test.csv") as path_csv:
with tm.ensure_clean("test.csv.zip") as path_zip:
            # make sure to create an uncompressed file with a zip extension
df.to_csv(path_csv, index=False)
Path(path_zip).write_text(
Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
)
tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)
def test_writes_tar_gz(all_parsers):
parser = all_parsers
data = DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
}
)
with tm.ensure_clean("test.tar.gz") as tar_path:
data.to_csv(tar_path, index=False)
# test that read_csv infers .tar.gz to gzip:
tm.assert_frame_equal(parser.read_csv(tar_path), data)
# test that file is indeed gzipped:
with tarfile.open(tar_path, "r:gz") as tar:
result = parser.read_csv(
tar.extractfile(tar.getnames()[0]), compression="infer"
)
tm.assert_frame_equal(result, data)

View File

@ -0,0 +1,36 @@
import numpy as np
import pytest
from pandas.errors import DtypeWarning
import pandas._testing as tm
from pandas.core.arrays import ArrowExtensionArray
from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
def test_concatenate_chunks_pyarrow():
# GH#51876
pa = pytest.importorskip("pyarrow")
chunks = [
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
{0: ArrowExtensionArray(pa.array([1, 2]))},
]
result = _concatenate_chunks(chunks)
expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
tm.assert_extension_array_equal(result[0], expected)
def test_concatenate_chunks_pyarrow_strings():
# GH#51876
pa = pytest.importorskip("pyarrow")
chunks = [
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
{0: ArrowExtensionArray(pa.array(["a", "b"]))},
]
with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
result = _concatenate_chunks(chunks)
expected = np.concatenate(
[np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
)
tm.assert_numpy_array_equal(result[0], expected)
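# The same fallback is what users see from read_csv itself: with
# low_memory chunking, a column whose chunks disagree on dtype comes back
# as object with a DtypeWarning. A minimal sketch; the row count is an
# assumption chosen to force more than one internal chunk:
def _example_user_facing_dtype_warning():
    import pandas as pd
    from io import StringIO

    data = "a\n" + "1\n" * (2**21) + "x\n"
    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
        df = pd.read_csv(StringIO(data), engine="c", low_memory=True)
    assert df["a"].dtype == object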

View File

@ -0,0 +1,263 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
from dateutil.parser import parse
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
def test_converters_type_must_be_dict(all_parsers):
parser = all_parsers
data = """index,A,B,C,D
foo,2,3,4,5
"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters=0)
return
with pytest.raises(TypeError, match="Type converters.+"):
parser.read_csv(StringIO(data), converters=0)
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
"converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
)
def test_converters(all_parsers, column, converter):
parser = all_parsers
data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters={column: converter})
return
result = parser.read_csv(StringIO(data), converters={column: converter})
expected = parser.read_csv(StringIO(data))
expected["D"] = expected["D"].map(converter)
tm.assert_frame_equal(result, expected)
def test_converters_no_implicit_conv(all_parsers):
# see gh-2184
parser = all_parsers
data = """000102,1.2,A\n001245,2,B"""
converters = {0: lambda x: x.strip()}
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=None, converters=converters)
return
result = parser.read_csv(StringIO(data), header=None, converters=converters)
    # Column 0 should not be cast to numeric and should remain object.
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
tm.assert_frame_equal(result, expected)
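# A declarative alternative to a strip-only converter: requesting str for
# the column keeps zero-padded codes intact without a callable. A minimal
# illustrative sketch, not part of the original suite:
def _example_preserve_leading_zeros():
    import pandas as pd

    data = "000102,1.2\n001245,2"
    df = pd.read_csv(StringIO(data), header=None, dtype={0: str})
    assert df[0].tolist() == ["000102", "001245"]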
def test_converters_euro_decimal_format(all_parsers):
# see gh-583
converters = {}
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
converters["Number1"] = converters["Number2"] = converters[
"Number3"
] = lambda x: float(x.replace(",", "."))
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=";", converters=converters)
return
result = parser.read_csv(StringIO(data), sep=";", converters=converters)
expected = DataFrame(
[
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
[2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
[3, 878.158, 108013.434, "GHI", "rez", 2.7356],
],
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
)
tm.assert_frame_equal(result, expected)
def test_converters_corner_with_nans(all_parsers):
parser = all_parsers
data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""
# Example converters.
def convert_days(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_days_sentinel(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_score(x):
x = x.strip()
if not x:
return np.nan
if x.find("-") > 0:
val_min, val_max = map(int, x.split("-"))
val = 0.5 * (val_min + val_max)
else:
val = float(x)
return val
results = []
for day_converter in [convert_days, convert_days_sentinel]:
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
converters={"score": convert_score, "days": day_converter},
na_values=["", None],
)
continue
result = parser.read_csv(
StringIO(data),
converters={"score": convert_score, "days": day_converter},
na_values=["", None],
)
assert pd.isna(result["days"][1])
results.append(result)
if parser.engine != "pyarrow":
tm.assert_frame_equal(results[0], results[1])
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
    # see gh-1835, GH#40589
parser = all_parsers
data = "A;B\n1;2\n3;4"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)
return
rs = parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
tm.assert_frame_equal(rs, xp)
def test_converter_identity_object(all_parsers):
# GH#40589
parser = all_parsers
data = "A,B\n1,2\n3,4"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters={"A": lambda x: x})
return
rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
tm.assert_frame_equal(rs, xp)
def test_converter_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,3"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=list(range(2)),
converters={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
return
result = parser.read_csv(
StringIO(data),
header=list(range(2)),
converters={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
expected = DataFrame(
{
("A", "X"): np.int32([1]),
("B", "Y"): np.int32([2]),
("B", "Z"): np.float32([3]),
}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,195 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def custom_dialect():
dialect_name = "weird"
dialect_kwargs = {
"doublequote": False,
"escapechar": "~",
"delimiter": ":",
"skipinitialspace": False,
"quotechar": "`",
"quoting": 3,
}
return dialect_name, dialect_kwargs
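# Outside the test helper, the equivalent setup uses the stdlib's
# csv.register_dialect; read_csv then accepts the dialect by name.
# A minimal illustrative sketch (remember to unregister afterwards):
def _example_registered_dialect():
    import pandas as pd

    csv.register_dialect("colon", delimiter=":")
    try:
        df = pd.read_csv(StringIO("a:b\n1:2"), dialect="colon")
    finally:
        csv.unregister_dialect("colon")
    assert list(df.columns) == ["a", "b"]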
def test_dialect(all_parsers):
parser = all_parsers
data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=dia)
return
df = parser.read_csv(StringIO(data), dialect=dia)
data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
exp = parser.read_csv(StringIO(data))
exp.replace("a", '"a', inplace=True)
tm.assert_frame_equal(df, exp)
def test_dialect_str(all_parsers):
dialect_name = "mydialect"
parser = all_parsers
data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
with tm.with_csv_dialect(dialect_name, delimiter=":"):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=dialect_name)
return
df = parser.read_csv(StringIO(data), dialect=dialect_name)
tm.assert_frame_equal(df, exp)
def test_invalid_dialect(all_parsers):
class InvalidDialect:
pass
data = "a\n1"
parser = all_parsers
msg = "Invalid dialect"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=InvalidDialect)
@pytest.mark.parametrize(
"arg",
[None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
warning_klass = None
kwds = {}
# arg=None tests when we pass in the dialect without any other arguments.
if arg is not None:
if value == "dialect": # No conflict --> no warning.
kwds[arg] = dialect_kwargs[arg]
elif value == "default": # Default --> no warning.
from pandas.io.parsers.base_parser import parser_defaults
kwds[arg] = parser_defaults[arg]
else: # Non-default + conflict with dialect --> warning.
warning_klass = ParserWarning
kwds[arg] = "blah"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv_check_warnings(
# No warning bc we raise
None,
"Conflicting values for",
StringIO(data),
dialect=dialect_name,
**kwds,
)
return
result = parser.read_csv_check_warnings(
warning_klass,
"Conflicting values for",
StringIO(data),
dialect=dialect_name,
**kwds,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,warning_klass",
[
({"sep": ","}, None), # sep is default --> sep_override=True
({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False
({"delimiter": ":"}, None), # No conflict
({"delimiter": None}, None), # Default arguments --> sep_override=True
({"delimiter": ","}, ParserWarning), # Conflict
({"delimiter": "."}, ParserWarning), # Conflict
],
ids=[
"sep-override-true",
"sep-override-false",
"delimiter-no-conflict",
"delimiter-default-arg",
"delimiter-conflict",
"delimiter-conflict2",
],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv_check_warnings(
# no warning bc we raise
None,
"Conflicting values for 'delimiter'",
StringIO(data),
dialect=dialect_name,
**kwargs,
)
return
result = parser.read_csv_check_warnings(
warning_klass,
"Conflicting values for 'delimiter'",
StringIO(data),
dialect=dialect_name,
**kwargs,
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,337 @@
"""
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import (
BytesIO,
TextIOWrapper,
)
import os
import tempfile
import uuid
import numpy as np
import pytest
from pandas import (
DataFrame,
read_csv,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_bytes_io_input(all_parsers):
encoding = "cp1255"
parser = all_parsers
data = BytesIO("שלום:1234\n562:123".encode(encoding))
result = parser.read_csv(data, sep=":", encoding=encoding)
expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_read_csv_unicode(all_parsers):
parser = all_parsers
data = BytesIO("\u0141aski, Jan;1".encode())
result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
expected = DataFrame([["\u0141aski, Jan", 1]])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
# see gh-2298
parser = all_parsers
data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
",", sep
)
path = f"__{uuid.uuid4()}__.csv"
kwargs = {"sep": sep, "skiprows": 2}
utf8 = "utf-8"
with tm.ensure_clean(path) as path:
bytes_data = data.encode(encoding)
with open(path, "wb") as f:
f.write(bytes_data)
with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
result = parser.read_csv(path, encoding=encoding, **kwargs)
expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)
def test_utf16_example(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
result = parser.read_csv(path, encoding="utf-16", sep="\t")
assert len(result) == 50
def test_unicode_encoding(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
result = parser.read_csv(path, header=None, encoding="latin-1")
result = result.set_index(0)
got = result[1][1632]
expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
assert got == expected
@pytest.mark.parametrize(
"data,kwargs,expected",
[
# Basic test
("a\n1", {}, DataFrame({"a": [1]})),
# "Regular" quoting
('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
# Test in a data row instead of header
("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
# Test in empty data row with skipping
("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
# Test in empty data row without skipping
(
"\n1",
{"names": ["a"], "skip_blank_lines": False},
DataFrame({"a": [np.nan, 1]}),
),
],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
# see gh-4793
parser = all_parsers
bom = "\ufeff"
utf8 = "utf-8"
def _encode_data_with_bom(_data):
bom_data = (bom + _data).encode(utf8)
return BytesIO(bom_data)
if (
parser.engine == "pyarrow"
and data == "\n1"
and kwargs.get("skip_blank_lines", True)
):
# CSV parse error: Empty CSV file or block: cannot infer number of columns
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)
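# Naming the codec explicitly is an alternative to relying on BOM
# stripping under plain "utf-8": the "utf-8-sig" codec consumes the BOM
# while decoding. A minimal illustrative sketch, not part of the suite:
def _example_utf8_sig():
    import pandas as pd

    raw = BytesIO("\ufeffa\n1".encode("utf-8"))
    df = pd.read_csv(raw, encoding="utf-8-sig")
    assert list(df.columns) == ["a"]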
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
# see gh-13549
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
data = "mb_num,multibyte\n4.8,test".encode(encoding)
result = parser.read_csv(BytesIO(data), encoding=encoding)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"file_path,encoding",
[
(("io", "data", "csv", "test1.csv"), "utf-8"),
(("io", "parser", "data", "unicode_series.csv"), "latin-1"),
(("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
# gh-23779: Python csv engine shouldn't error on files opened in binary.
# gh-31575: Python csv engine shouldn't error on files opened in raw binary.
parser = all_parsers
fpath = datapath(*file_path)
expected = parser.read_csv(fpath, encoding=encoding)
with open(fpath, encoding=encoding) as fa:
result = parser.read_csv(fa)
assert not fa.closed
tm.assert_frame_equal(expected, result)
with open(fpath, mode="rb") as fb:
result = parser.read_csv(fb, encoding=encoding)
assert not fb.closed
tm.assert_frame_equal(expected, result)
with open(fpath, mode="rb", buffering=0) as fb:
result = parser.read_csv(fb, encoding=encoding)
assert not fb.closed
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
# see gh-24130
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
# FIXME: this is bad!
pytest.skip("These cases freeze")
expected = DataFrame({"foo": ["bar"]})
with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
f.write("foo\nbar")
f.seek(0)
result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
tm.assert_frame_equal(result, expected)
def test_encoding_named_temp_file(all_parsers):
# see gh-31819
parser = all_parsers
encoding = "shift-jis"
title = "てすと"
data = "こむ"
expected = DataFrame({title: [data]})
with tempfile.NamedTemporaryFile() as f:
f.write(f"{title}\n{data}".encode(encoding))
f.seek(0)
result = parser.read_csv(f, encoding=encoding)
tm.assert_frame_equal(result, expected)
assert not f.closed
@pytest.mark.parametrize(
"encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
# GH16218 Verify parsing of data with encoded special characters
# Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
data = "a\tb\nfoo\t0\nbar\t1\nbaz\t2" # noqa: RUF001
encoded_data = BytesIO(data.encode(encoding))
result = read_csv(encoded_data, delimiter="\t", encoding=encoding)
expected = DataFrame(
data=[["foo", 0], ["bar", 1], ["baz", 2]], # noqa: RUF001
columns=["a", "b"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
# GH40986
parser = all_parsers
expected = DataFrame(
{
"name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
"mask": ["red", "purple", "orange", "blue"],
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
}
)
with tm.ensure_clean() as file:
expected.to_csv(file, index=False, encoding=encoding)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(file, encoding=encoding, memory_map=True)
return
df = parser.read_csv(file, encoding=encoding, memory_map=True)
tm.assert_frame_equal(df, expected)
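# memory_map=True maps the file into the process instead of streaming
# through a read buffer; it needs a real on-disk file, which is why the
# tests above round-trip through tm.ensure_clean. A minimal sketch:
def _example_memory_map():
    import pandas as pd

    with tm.ensure_clean() as path:
        DataFrame({"a": [1, 2]}).to_csv(path, index=False)
        df = pd.read_csv(path, memory_map=True)
    assert df["a"].tolist() == [1, 2]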
def test_chunk_splits_multibyte_char(all_parsers):
"""
Chunk splits a multibyte character with memory_map=True
GH 43540
"""
parser = all_parsers
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
df = DataFrame(data=["a" * 127] * 2048)
    # Put the two-byte utf-8 encoded character "ą" at the end of a chunk;
    # the utf-8 encoding of "ą" is b'\xc4\x85'
df.iloc[2047] = "a" * 127 + "ą"
with tm.ensure_clean("bug-gh43540.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True)
return
dfr = parser.read_csv(fname, header=None, memory_map=True)
tm.assert_frame_equal(dfr, df)
def test_readcsv_memmap_utf8(all_parsers):
"""
GH 43787
Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
"""
lines = []
line_length = 128
start_char = " "
end_char = "\U00010080"
# This for loop creates a list of 128-char strings
# consisting of consecutive Unicode chars
for lnum in range(ord(start_char), ord(end_char), line_length):
line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
try:
line.encode("utf-8")
except UnicodeEncodeError:
continue
lines.append(line)
parser = all_parsers
df = DataFrame(lines)
with tm.ensure_clean("utf8test.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
return
dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
tm.assert_frame_equal(df, dfr)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
# GH43439
parser = all_parsers
content = b"abcd"
if "t" in mode:
content = "abcd"
with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
handle.write(content)
handle.seek(0)
df = parser.read_csv(handle)
expected = DataFrame([], columns=["abcd"])
tm.assert_frame_equal(df, expected)

View File

@ -0,0 +1,733 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""
from collections import namedtuple
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@xfail_pyarrow # TypeError: an integer is required
def test_read_with_bad_header(all_parsers):
parser = all_parsers
msg = r"but only \d+ lines in file"
with pytest.raises(ValueError, match=msg):
s = StringIO(",,")
parser.read_csv(s, header=[10])
def test_negative_header(all_parsers):
# see gh-27779
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
with pytest.raises(
ValueError,
match="Passing negative integer to header is invalid. "
"For no header, use header=None instead",
):
parser.read_csv(StringIO(data), header=-1)
@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
def test_negative_multi_index_header(all_parsers, header):
# see gh-27779
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
with pytest.raises(
ValueError, match="cannot specify multi-index header with negative integers"
):
parser.read_csv(StringIO(data), header=header)
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
# see gh-6114
parser = all_parsers
data = """\
MyColumn
a
b
a
b"""
msg = "Passing a bool to header is invalid"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), header=header)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_header_with_index_col(all_parsers):
parser = all_parsers
data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
names = ["A", "B", "C"]
result = parser.read_csv(StringIO(data), names=names)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
def test_header_not_first_line(all_parsers):
parser = all_parsers
data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
result = parser.read_csv(StringIO(data), header=2, index_col=0)
expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
tm.assert_frame_equal(result, expected)
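# Hedged sketch (illustrative, not an original test): header=N takes row N
# (0-based) as the header and discards everything above it, which is the
# behavior the comparison above relies on.
def _demo_header_not_first_line():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("skip,this\nskip,this\na,b\n1,2\n"), header=2)
    assert list(df.columns) == ["a", "b"]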
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index(all_parsers):
parser = all_parsers
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
data_gen_f = lambda r, c: f"R{r}C{c}"
data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
index = MultiIndex.from_arrays(
[[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
names=["R0", "R1"],
)
columns = MultiIndex.from_arrays(
[
[f"C_l0_g{i}" for i in range(3)],
[f"C_l1_g{i}" for i in range(3)],
[f"C_l2_g{i}" for i in range(3)],
[f"C_l3_g{i}" for i in range(3)],
],
names=["C0", "C1", "C2", "C3"],
)
expected = DataFrame(data, columns=columns, index=index)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
(
{"index_col": ["foo", "bar"]},
(
"index_col must only contain "
"row numbers when specifying "
"a multi-index header"
),
),
(
{"index_col": [0, 1], "names": ["foo", "bar"]},
("cannot specify names when specifying a multi-index header"),
),
(
{"index_col": [0, 1], "usecols": ["foo", "bar"]},
("cannot specify usecols when specifying a multi-index header"),
),
],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
_TestTuple = namedtuple("_TestTuple", ["first", "second"])
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 3,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 3,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 2,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 2,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 2,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 2,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
expected = expected.reset_index(drop=True)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed1(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=["a", "q"],
),
)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed2(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed3(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_blank_line(all_parsers):
# GH 40442
parser = all_parsers
data = [[None, None], [1, 2], [3, 4]]
columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
expected = DataFrame(data, columns=columns)
data = "a,b\nA,B\n,\n1,2\n3,4"
result = parser.read_csv(StringIO(data), header=[0, 1])
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header, request):
# see gh-2539
parser = all_parsers
if parser.engine == "pyarrow" and header is not None:
mark = pytest.mark.xfail(reason="DataFrame.columns are different")
request.applymarker(mark)
expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
def test_read_only_header_no_rows(all_parsers, kwargs):
# See gh-7773
parser = all_parsers
expected = DataFrame(columns=["a", "b", "c"])
result = parser.read_csv(StringIO("a,b,c"), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,names",
[
({}, [0, 1, 2, 3, 4]),
(
{"names": ["foo", "bar", "baz", "quux", "panda"]},
["foo", "bar", "baz", "quux", "panda"],
),
],
)
def test_no_header(all_parsers, kwargs, names):
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
)
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
# see gh-16338
msg = "header must be integer or list of integers"
data = """1,2\n3,4"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=header)
@xfail_pyarrow # TypeError: an integer is required
def test_singleton_header(all_parsers):
# see gh-7757
data = """a,b,c\n0,1,2\n1,2,3"""
parser = all_parsers
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
result = parser.read_csv(StringIO(data), header=[0])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"data,expected",
[
(
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
),
),
),
(
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
),
),
),
(
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
DataFrame(
[[0, 40, 34, 0.1, 0.1]],
columns=MultiIndex.from_tuples(
[
("A", "one"),
("A", "one.1"),
("A", "one.1.1"),
("B", "two"),
("B", "two.1"),
]
),
),
),
],
)
def test_mangles_multi_index(all_parsers, data, expected):
# see gh-18062
parser = all_parsers
result = parser.read_csv(StringIO(data), header=[0, 1])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
"columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
# see gh-23687
#
# When specifying a multi-index header, make sure that
# we don't error just because one of the rows in our header
# has ALL column names containing the string "Unnamed". The
# correct condition to check is whether the row contains
# ALL columns that did not have names (and instead were given
# placeholder ones).
parser = all_parsers
header = [0, 1]
if index_col is None:
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
else:
data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
exp_columns = []
if columns is None:
columns = ["", "", ""]
for i, col in enumerate(columns):
if not col: # Unnamed.
col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
exp_columns.append(col)
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
tm.assert_frame_equal(result, expected)
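# Illustrative sketch (not an original test): an empty field in the header
# row is replaced with an "Unnamed: {position}" placeholder, which is what
# the condition described above keys off.
def _demo_unnamed_placeholder():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO(",b\n1,2\n"))
    assert list(df.columns) == ["Unnamed: 0", "b"]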
@skip_pyarrow # CSV parse error: Expected 2 columns, got 3
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
# GH#38453
parser = all_parsers
data = """a, b
1,2,3
5,6,4
"""
result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_read_csv_multiindex_columns(all_parsers):
# GH#6051
parser = all_parsers
s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
s2 = (
"Male, Male, Male, Female, Female\n"
"R, R, L, R, R\n"
".86, .67, .88, .78, .81\n"
".86, .67, .88, .78, .82"
)
mi = MultiIndex.from_tuples(
[
("Male", "R"),
(" Male", " R"),
(" Male", " L"),
(" Female", " R"),
(" Female", " R.1"),
]
)
expected = DataFrame(
[[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
)
df1 = parser.read_csv(StringIO(s1), header=[0, 1])
tm.assert_frame_equal(df1, expected.iloc[:1])
df2 = parser.read_csv(StringIO(s2), header=[0, 1])
tm.assert_frame_equal(df2, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_read_csv_multi_header_length_check(all_parsers):
# GH#43102
parser = all_parsers
case = """row11,row12,row13
row21,row22, row23
row31,row32
"""
with pytest.raises(
ParserError, match="Header rows must have an equal number of columns."
):
parser.read_csv(StringIO(case), header=[0, 2])
@skip_pyarrow # CSV parse error: Expected 3 columns, got 2
def test_header_none_and_implicit_index(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1,5\ny,2\nz,3\n"
result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
expected = DataFrame(
{"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got "
def test_header_none_and_implicit_index_in_second_row(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1\ny,2,5\nz,3\n"
with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
parser.read_csv(StringIO(data), names=["a", "b"], header=None)
def test_header_none_and_on_bad_lines_skip(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1\ny,2,5\nz,3\n"
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
)
expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
tm.assert_frame_equal(result, expected)
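# Illustrative sketch (not an original test): on_bad_lines="skip" silently
# drops any row with more fields than expected instead of raising ParserError.
def _demo_on_bad_lines_skip():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("a,b\n1,2\n3,4,5\n"), on_bad_lines="skip")
    assert df.shape == (1, 2)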
@xfail_pyarrow # TypeError: an integer is required
def test_header_missing_rows(all_parsers):
# GH#47400
parser = all_parsers
data = """a,b
1,2
"""
msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=[0, 1, 2])
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_multiple_whitespaces(all_parsers):
# GH#54931
parser = all_parsers
data = """aa bb(1,1) cc(1,1)
0 2 3.5"""
result = parser.read_csv(StringIO(data), sep=r"\s+")
expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
tm.assert_frame_equal(result, expected)
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_delim_whitespace(all_parsers):
# GH#54918
parser = all_parsers
data = """a,b
1,2
3,4
"""
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)
def test_usecols_no_header_pyarrow(pyarrow_parser_only):
parser = pyarrow_parser_only
data = """
a,i,x
b,j,y
"""
result = parser.read_csv(
StringIO(data),
header=None,
usecols=[0, 1],
dtype="string[pyarrow]",
dtype_backend="pyarrow",
engine="pyarrow",
)
expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
tm.assert_frame_equal(result, expected)
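# Illustrative sketch (not an original test): header=[0, 1] stacks the first
# two rows into MultiIndex column labels, one tuple per column, as exercised
# throughout this file.
def _demo_multi_index_header():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("a,a,b\nx,y,z\n1,2,3\n"), header=[0, 1])
    assert df.columns.tolist() == [("a", "x"), ("a", "y"), ("b", "z")]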

View File

@ -0,0 +1,376 @@
"""
Tests that the specified index column (a.k.a. "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
parser = all_parsers
no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
if with_header:
data = header + no_header
result = parser.read_csv(StringIO(data), index_col="ID")
expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
tm.assert_frame_equal(result, expected)
else:
data = no_header
msg = "Index ID invalid"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col="ID")
def test_index_col_named2(all_parsers):
parser = all_parsers
data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
expected = DataFrame(
{"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
index=Index(["hello", "world", "foo"], name="message"),
)
names = ["a", "b", "c", "d", "message"]
result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
tm.assert_frame_equal(result, expected)
def test_index_col_is_true(all_parsers):
# see gh-9798
data = "a,b\n1,2"
parser = all_parsers
msg = "The value of index_col couldn't be 'True'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col=True)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
def test_infer_index_col(all_parsers):
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
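# Illustrative sketch (not an original test): when data rows carry one more
# field than the header, the leading column is inferred to be the index
# (C and python engines; pyarrow raises instead, hence the skip above).
def _demo_implicit_index():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("A,B\nx,1,2\ny,3,4\n"))
    assert list(df.index) == ["x", "y"]
    assert list(df.columns) == ["A", "B"]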
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"index_col,kwargs",
[
(None, {"columns": ["x", "y", "z"]}),
(False, {"columns": ["x", "y", "z"]}),
(0, {"columns": ["y", "z"], "index": Index([], name="x")}),
(1, {"columns": ["x", "z"], "index": Index([], name="y")}),
("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
(
[0, 1],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
},
),
(
["x", "y"],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
},
),
(
[1, 0],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
},
),
(
["y", "x"],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
},
),
],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=index_col)
expected = DataFrame(**kwargs)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_index_col_false(all_parsers):
# see gh-10413
data = "x,y"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame(columns=["x", "y"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"index_names",
[
["", ""],
["foo", ""],
["", "bar"],
["foo", "bar"],
["NotReallyUnnamed", "Unnamed: 0"],
],
)
def test_multi_index_naming(all_parsers, index_names, request):
parser = all_parsers
if parser.engine == "pyarrow" and "" in index_names:
mark = pytest.mark.xfail(reason="One case raises, others are wrong")
request.applymarker(mark)
# We don't want empty index names being replaced with "Unnamed: 0"
data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
result = parser.read_csv(StringIO(data), index_col=[0, 1])
expected = DataFrame(
{"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
)
expected.index.names = [name if name else None for name in index_names]
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_multi_index_naming_not_all_at_beginning(all_parsers):
parser = all_parsers
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
result = parser.read_csv(StringIO(data), index_col=[0, 2])
expected = DataFrame(
{"Unnamed: 2": ["c", "d", "c", "d"]},
index=MultiIndex(
levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_no_multi_index_level_names_empty(all_parsers):
# GH 10984
parser = all_parsers
midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
expected = DataFrame(
np.random.default_rng(2).standard_normal((3, 3)),
index=midx,
columns=["x", "y", "z"],
)
with tm.ensure_clean() as path:
expected.to_csv(path)
result = parser.read_csv(path, index_col=[0, 1, 2])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_with_index_col(all_parsers):
# GH 33476
parser = all_parsers
data = """
I11,A,A
I12,B,B
I2,1,3
"""
midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
idx = Index(["I2"])
expected = DataFrame([[1, 3]], index=idx, columns=midx)
result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
tm.assert_frame_equal(result, expected)
col_idx = Index(["A", "A.1"])
idx = Index(["I12", "I2"], name="I11")
expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
result = parser.read_csv(StringIO(data), index_col="I11", header=0)
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_index_col_large_csv(all_parsers, monkeypatch):
# https://github.com/pandas-dev/pandas/issues/37094
parser = all_parsers
ARR_LEN = 100
df = DataFrame(
{
"a": range(ARR_LEN + 1),
"b": np.random.default_rng(2).standard_normal(ARR_LEN + 1),
}
)
with tm.ensure_clean() as path:
df.to_csv(path, index=False)
with monkeypatch.context() as m:
m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
result = parser.read_csv(path, index_col=[0])
tm.assert_frame_equal(result, df.set_index("a"))
@xfail_pyarrow # TypeError: an integer is required
def test_index_col_multiindex_columns_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(
StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
)
expected = DataFrame(
[],
index=Index([]),
columns=MultiIndex.from_arrays(
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_index_col_header_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
expected = DataFrame(
[],
columns=["a1", "a2"],
index=Index([], name="a0"),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
expected = DataFrame(
[], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_index_col_with_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(
StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
)
expected = DataFrame(
[["data", "data"]],
columns=MultiIndex.from_arrays(
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
),
index=Index(["data"]),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_infer_types_boolean_sum(all_parsers):
# GH#44079
parser = all_parsers
result = parser.read_csv(
StringIO("0,1"),
names=["a", "b"],
index_col=["a"],
dtype={"a": "UInt8"},
)
expected = DataFrame(
data={
"a": [
0,
],
"b": [1],
}
).set_index("a")
# Not checking index type now, because the C parser will return a
# index column of dtype 'object', and the Python parser will return a
# index column of dtype 'int64'.
tm.assert_frame_equal(result, expected, check_index_type=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
data = "a,b\n01,2"
parser = all_parsers
if dtype == object and parser.engine == "pyarrow":
request.applymarker(
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_not_leading_index_col(all_parsers):
# GH#38549
parser = all_parsers
data = """a,b,c,d
e,f,g,h
x,y,1,2
"""
result = parser.read_csv(
StringIO(data),
header=[0, 1],
index_col=1,
)
cols = MultiIndex.from_tuples(
[("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
)
expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
tm.assert_frame_equal(result, expected)
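# Illustrative sketch (not an original test): index_col also accepts column
# labels, not just positions, mirroring several of the cases above.
def _demo_index_col_by_name():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("id,val\na,1\nb,2\n"), index_col="id")
    assert df.index.name == "id"
    assert list(df.index) == ["a", "b"]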

View File

@ -0,0 +1,179 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling is requested) or otherwise ignored.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic(all_parsers):
parser = all_parsers
data = "a,a,b,b,b\n1,2,3,4,5"
result = parser.read_csv(StringIO(data), sep=",")
expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
# See gh-7160
parser = all_parsers
data = "a,b,a\n0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_basic_names_raise(all_parsers):
# See gh-7160
parser = all_parsers
data = "0,1,2\n3,4,5"
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=["a", "b", "a"])
@xfail_pyarrow # ValueError: Found non-unique column index
@pytest.mark.parametrize(
"data,expected",
[
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
(
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
DataFrame(
[[1, 2, 3, 4, 5, 6]],
columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
),
),
(
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
),
),
],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
# see gh-17060
parser = all_parsers
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,names,expected",
[
(
"a,b,b\n1,2,3",
["a.1", "a.1", "a.1.1"],
DataFrame(
[["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
),
),
(
"a,b,c,d,e,f\n1,2,3,4,5,6",
["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
DataFrame(
[["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
),
),
(
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
["a", "a", "a.3", "a.1", "a.2", "a", "a"],
DataFrame(
[
["a", "b", "c", "d", "e", "f", "g"],
["1", "2", "3", "4", "5", "6", "7"],
],
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
),
),
],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
# see gh-17095
parser = all_parsers
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=names)
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
orig_key = "0"
parser = all_parsers
orig_value = [1, 2, 3]
df = DataFrame({orig_key: orig_value})
# This test recursively updates `df`.
for i in range(3):
expected = DataFrame()
for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
expected.insert(loc=0, column=col_name, value=[0, 1, 2])
expected[orig_key] = orig_value
df = parser.read_csv(StringIO(df.to_csv()))
tm.assert_frame_equal(df, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists(all_parsers):
# GH#14704
parser = all_parsers
data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
# GH#14704
parser = all_parsers
data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4]],
columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
def test_mangle_cols_names(all_parsers, usecol, engine):
# GH 11823
parser = all_parsers
data = "1,2,3"
names = ["A", "A", "B"]
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine)
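# Illustrative sketch (not an original test): duplicate header labels are
# de-duplicated with ".N" suffixes, the behavior exercised throughout this
# file.
def _demo_mangle_dupe_cols():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("a,a,b\n1,2,3\n"))
    assert list(df.columns) == ["a", "a.1", "b"]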

View File

@ -0,0 +1,157 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.util.version import Version
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = [
pytest.mark.single_cpu,
pytest.mark.slow,
]
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_multi_thread_string_io_read_csv(all_parsers, request):
# see gh-11786
parser = all_parsers
if parser.engine == "pyarrow":
pa = pytest.importorskip("pyarrow")
if Version(pa.__version__) < Version("16.0"):
request.applymarker(
pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
)
max_row_range = 100
num_files = 10
bytes_to_df = (
"\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
for _ in range(num_files)
)
# Read all files in many threads.
with ExitStack() as stack:
files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]
pool = stack.enter_context(ThreadPool(8))
results = pool.map(parser.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
"""
    Generate a DataFrame by reading a CSV file in multiple threads.
Parameters
----------
parser : BaseParser
The parser object to use for reading the data.
path : str
The location of the CSV file to read.
num_rows : int
        The total number of rows to read across all tasks.
num_tasks : int
The number of tasks to use for reading this DataFrame.
Returns
-------
df : DataFrame
"""
def reader(arg):
"""
Create a reader for part of the CSV.
Parameters
----------
arg : tuple
A tuple of the following:
* start : int
                The row from which to start parsing the CSV.
* nrows : int
The number of rows to read.
Returns
-------
df : DataFrame
"""
start, nrows = arg
if not start:
return parser.read_csv(
path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
)
return parser.read_csv(
path,
index_col=0,
header=None,
skiprows=int(start) + 1,
nrows=nrows,
parse_dates=[9],
)
tasks = [
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
]
with ThreadPool(processes=num_tasks) as pool:
results = pool.map(reader, tasks)
header = results[0].columns
for r in results[1:]:
r.columns = header
final_dataframe = pd.concat(results)
return final_dataframe
@xfail_pyarrow # ValueError: The 'nrows' option is not supported
def test_multi_thread_path_multipart_read_csv(all_parsers):
# see gh-11786
num_tasks = 4
num_rows = 48
parser = all_parsers
file_name = "__thread_pool_reader__.csv"
df = DataFrame(
{
"a": np.random.default_rng(2).random(num_rows),
"b": np.random.default_rng(2).random(num_rows),
"c": np.random.default_rng(2).random(num_rows),
"d": np.random.default_rng(2).random(num_rows),
"e": np.random.default_rng(2).random(num_rows),
"foo": ["foo"] * num_rows,
"bar": ["bar"] * num_rows,
"baz": ["baz"] * num_rows,
"date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
"int": np.arange(num_rows, dtype="int64"),
}
)
with tm.ensure_clean(file_name) as path:
df.to_csv(path)
final_dataframe = _generate_multi_thread_dataframe(
parser, path, num_rows, num_tasks
)
tm.assert_frame_equal(df, final_dataframe)
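# Illustrative sketch (not an original test): the minimal shape of the
# thread-pool pattern used above, with each thread parsing its own buffer.
def _demo_thread_pool_read():
    from io import BytesIO
    from multiprocessing.pool import ThreadPool

    import pandas as pd

    payload = b"a,b\n1,2\n3,4\n"
    buffers = [BytesIO(payload) for _ in range(4)]
    with ThreadPool(2) as pool:
        frames = pool.map(pd.read_csv, buffers)
    assert all(frame.equals(frames[0]) for frame in frames)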

View File

@ -0,0 +1,771 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs.parsers import STR_NA_VALUES
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_string_nas(all_parsers):
parser = all_parsers
data = """A,B,C
a,b,c
d,,f
,g,h
"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
columns=["A", "B", "C"],
)
if parser.engine == "pyarrow":
expected.loc[2, "A"] = None
expected.loc[1, "B"] = None
tm.assert_frame_equal(result, expected)
def test_detect_string_na(all_parsers):
parser = all_parsers
data = """A,B
foo,bar
NA,baz
NaN,nan
"""
expected = DataFrame(
[["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
)
if parser.engine == "pyarrow":
expected.loc[[1, 2], "A"] = None
expected.loc[2, "B"] = None
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values",
[
["-999.0", "-999"],
[-999, -999.0],
[-999.0, -999],
["-999.0"],
["-999"],
[-999.0],
[-999],
],
)
@pytest.mark.parametrize(
"data",
[
"""A,B
-999,1.2
2,-999
3,4.5
""",
"""A,B
-999,1.200
2,-999.000
3,4.500
""",
],
)
def test_non_string_na_values(all_parsers, data, na_values, request):
# see gh-3611: with an odd float format, we can't match
# the string "999.0" exactly but still need float matching
parser = all_parsers
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])
if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values):
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values)
return
elif parser.engine == "pyarrow" and "-999.000" in data:
        # Because the pyarrow engine does not include the float-ified version
        # of "-999" -> -999, it does not match the entry with the trailing
        # zeros, so "-999.000" is not treated as null.
mark = pytest.mark.xfail(
reason="pyarrow engined does not recognize equivalent floats"
)
request.applymarker(mark)
result = parser.read_csv(StringIO(data), na_values=na_values)
tm.assert_frame_equal(result, expected)
def test_default_na_values(all_parsers):
_NA_VALUES = {
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A",
"N/A",
"n/a",
"NA",
"<NA>",
"#NA",
"NULL",
"null",
"NaN",
"nan",
"-NaN",
"-nan",
"#N/A N/A",
"",
"None",
}
assert _NA_VALUES == STR_NA_VALUES
parser = all_parsers
nv = len(_NA_VALUES)
    def f(i, v):
        # Build row i of an nv x nv grid: sentinel v lands in column i and
        # every other field is left empty, so each default NA string gets
        # its own cell.
        if i == 0:
            buf = ""
        elif i > 0:
            buf = "".join([","] * i)
        buf = f"{buf}{v}"
        if i < nv - 1:
            joined = "".join([","] * (nv - i - 1))
            buf = f"{buf}{joined}"
        return buf
data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
result = parser.read_csv(data, header=None)
tm.assert_frame_equal(result, expected)
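# Illustrative sketch (not an original test): user-supplied na_values extend
# the default sentinel set rather than replacing it (keep_default_na=True).
def _demo_na_values_extend_defaults():
    from io import StringIO

    import pandas as pd

    # "NA" is a default sentinel; "missing" becomes one only because we ask.
    df = pd.read_csv(StringIO("a\nNA\nmissing\n"), na_values=["missing"])
    assert df["a"].isna().all()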
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
expected = DataFrame(
[[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "skiprows argument must be an integer when using engine='pyarrow'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
return
result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
tm.assert_frame_equal(result, expected)
def test_bool_na_values(all_parsers):
data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{
"A": np.array([True, np.nan, False], dtype=object),
"B": np.array([False, True, np.nan], dtype=object),
"C": [True, False, True],
}
)
if parser.engine == "pyarrow":
expected.loc[1, "A"] = None
expected.loc[2, "B"] = None
tm.assert_frame_equal(result, expected)
def test_na_value_dict(all_parsers):
data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
return
df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
expected = DataFrame(
{
"A": [np.nan, "bar", np.nan, "bar"],
"B": [np.nan, "foo", np.nan, "foo"],
"C": [np.nan, "foo", np.nan, "foo"],
}
)
tm.assert_frame_equal(df, expected)
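# Illustrative sketch (not an original test, C/python engines only): a dict
# scopes the extra NA sentinels to individual columns.
def _demo_na_values_per_column():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("A,B\nfoo,foo\n"), na_values={"A": ["foo"]})
    assert df["A"].isna().all()
    assert df.loc[0, "B"] == "foo"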
@pytest.mark.parametrize(
"index_col,expected",
[
(
[0],
DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
),
(
[0, 2],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
(
["a", "c"],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
data = """\
a,b,c,d
0,NA,1,5
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
(
{},
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
(
{"na_values": {"A": [], "C": []}, "keep_default_na": False},
DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
{"na_values": ["a"], "keep_default_na": False},
DataFrame(
{
"A": [np.nan, "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
{"na_values": {"A": [], "C": []}},
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
],
)
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
if parser.engine == "pyarrow":
if "na_values" in kwargs and isinstance(kwargs["na_values"], dict):
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
mark = pytest.mark.xfail()
request.applymarker(mark)
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields "None" as an na_value
data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), keep_default_na=False)
expected = DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["None", "two", "None", "nan", "five", "", "seven"],
}
)
tm.assert_frame_equal(result, expected)
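# Illustrative sketch (not an original test): with keep_default_na=False and
# no na_values, no string is treated as missing, so "NA" survives verbatim.
def _demo_keep_default_na_false():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("a\nNA\n"), keep_default_na=False)
    assert df.loc[0, "a"] == "NA"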
def test_no_keep_default_na_dict_na_values(all_parsers):
# see gh-19227
data = "a,b\n,2"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
)
return
result = parser.read_csv(
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
)
expected = DataFrame({"a": [""], "b": [np.nan]})
tm.assert_frame_equal(result, expected)
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
# see gh-19227
#
# Scalar values shouldn't cause the parsing to crash or fail.
data = "a,b\n1,2"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
return
df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
expected = DataFrame({"a": [1], "b": [np.nan]})
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
# see gh-19227
data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
parser = all_parsers
expected = DataFrame(
{
0: [np.nan, 729639.0],
1: [np.nan, "qwer"],
2: ["/blaha", np.nan],
3: ["kjsdkj", "asdfkj"],
4: [412.166, 466.681],
5: ["225.874", ""],
6: [np.nan, 252.373],
}
)
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=None,
keep_default_na=False,
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
)
return
result = parser.read_csv(
StringIO(data),
header=None,
keep_default_na=False,
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
data = """\
A,B
1,A
nan,B
3,C
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
expected = DataFrame(row_data, columns=["A", "B"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 8 columns, got 5:
def test_na_trailing_columns(all_parsers):
parser = all_parsers
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
# Trailing columns should be all NaN.
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
],
columns=[
"Date",
"Currency",
"Symbol",
"Type",
"Units",
"UnitPrice",
"Cost",
"Tax",
],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values,row_data",
[
(1, [[np.nan, 2.0], [2.0, np.nan]]),
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
# see gh-12224
parser = all_parsers
names = ["a", "b"]
data = "1,2\n2,1"
if parser.engine == "pyarrow" and isinstance(na_values, dict):
if isinstance(na_values, dict):
err = ValueError
msg = "The pyarrow engine doesn't support passing a dict for na_values"
else:
err = TypeError
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
elif parser.engine == "pyarrow":
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
expected = DataFrame(row_data, columns=names)
tm.assert_frame_equal(result, expected)
def test_na_values_dict_aliasing(all_parsers):
parser = all_parsers
na_values = {"a": 2, "b": 1}
na_values_copy = na_values.copy()
names = ["a", "b"]
data = "1,2\n2,1"
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
tm.assert_frame_equal(result, expected)
tm.assert_dict_equal(na_values, na_values_copy)
def test_na_values_dict_col_index(all_parsers):
# see gh-14203
data = "a\nfoo\n1"
parser = all_parsers
na_values = {0: "foo"}
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values)
return
result = parser.read_csv(StringIO(data), na_values=na_values)
expected = DataFrame({"a": [np.nan, 1]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
str(2**63) + "\n" + str(2**63 + 1),
{"na_values": [2**63]},
DataFrame([str(2**63), str(2**63 + 1)]),
),
(str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
(str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected, request):
# see gh-14983
parser = all_parsers
if parser.engine == "pyarrow" and "na_values" in kwargs:
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), header=None, **kwargs)
return
elif parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="Returns float64 instead of object")
request.applymarker(mark)
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
def test_empty_na_values_no_default_with_index(all_parsers):
# see gh-15835
data = "a,1\nb,2"
parser = all_parsers
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request):
# see gh-5239
#
# Don't parse NA-values in index unless na_filter=True
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
if parser.engine == "pyarrow" and na_filter is False:
mark = pytest.mark.xfail(reason="mismatched index result")
request.applymarker(mark)
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
tm.assert_frame_equal(result, expected)
def test_inf_na_values_with_int_index(all_parsers):
# see gh-17128
parser = all_parsers
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
    # Don't fail with OverflowError with inf values and an integer index column.
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
expected = DataFrame(
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
)
tm.assert_frame_equal(out, expected)
@xfail_pyarrow # mismatched shape
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
# see gh-20377
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
# na_filter=True --> missing value becomes NaN.
# na_filter=False --> missing value remains empty string.
empty = np.nan if na_filter else ""
expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # mismatched exception message
@pytest.mark.parametrize(
"data, na_values",
[
("false,1\n,1\ntrue", None),
("false,1\nnull,1\ntrue", None),
("false,1\nnan,1\ntrue", None),
("false,1\nfoo,1\ntrue", "foo"),
("false,1\nfoo,1\ntrue", ["foo"]),
("false,1\nfoo,1\ntrue", {"a": "foo"}),
],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
parser = all_parsers
msg = "|".join(
[
"Bool column has NA values in column [0a]",
"cannot safely convert passed user dtype of "
"bool for object dtyped data in column 0",
]
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=None,
names=["a", "b"],
dtype={"a": "bool"},
na_values=na_values,
)
# TODO: this test isn't about the na_values keyword, it is about the empty entries
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
@xfail_pyarrow # mismatched shapes
def test_str_nan_dropped(all_parsers):
# see gh-21131
parser = all_parsers
data = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""
result = parser.read_csv(
StringIO(data),
header=None,
names=["col1", "col2", "col3"],
dtype={"col1": str, "col2": str, "col3": str},
).dropna()
expected = DataFrame(
{
"col1": ["10010010233", "01001000155"],
"col2": ["0123", "4530"],
"col3": ["654", "898"],
},
index=[1, 3],
)
tm.assert_frame_equal(result, expected)
def test_nan_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,inf"
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
)
return
result = parser.read_csv(
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
)
expected = DataFrame(
{
("A", "X"): [1],
("B", "Y"): [2],
("B", "Z"): [np.nan],
}
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # Failed: DID NOT RAISE <class 'ValueError'>; it casts the NaN to False
def test_bool_and_nan_to_bool(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
with pytest.raises(ValueError, match="NA values"):
parser.read_csv(StringIO(data), dtype="bool")
def test_bool_and_nan_to_int(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
with pytest.raises(ValueError, match="convert|NoneType"):
parser.read_csv(StringIO(data), dtype="int")
def test_bool_and_nan_to_float(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
result = parser.read_csv(StringIO(data), dtype="float")
expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
tm.assert_frame_equal(result, expected)
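# Illustrative sketch (not an original test): na_filter=False switches NaN
# detection off entirely, which can also speed up parsing of files that are
# known to contain no missing values.
def _demo_na_filter_off():
    from io import StringIO

    import pandas as pd

    df = pd.read_csv(StringIO("a\nNaN\n"), na_filter=False)
    assert df.loc[0, "a"] == "NaN"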

View File

@ -0,0 +1,327 @@
"""
Tests the parsers' ability to read and parse non-local files,
which therefore requires a network connection.
"""
from io import BytesIO
import logging
import re
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.feather_format import read_feather
from pandas.io.parsers import read_csv
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
httpserver,
datapath,
salaries_table,
mode,
engine,
compression_only,
compression_to_extension,
):
# test reading compressed urls with various engines and
# extension inference
if compression_only == "tar":
pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data")
extension = compression_to_extension[compression_only]
with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
httpserver.serve_content(content=f.read())
url = httpserver.url + "/salaries.csv" + extension
if mode != "explicit":
compression_only = mode
url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine)
tm.assert_frame_equal(url_table, salaries_table)
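# Illustrative sketch (not an original test, no network needed): for buffers
# there is no file extension to infer from, so the codec is passed
# explicitly; for paths and URLs, compression="infer" keys off the extension
# as the test above exercises.
def _demo_explicit_compression():
    import gzip
    from io import BytesIO

    import pandas as pd

    buf = BytesIO(gzip.compress(b"a,b\n1,2\n"))
    df = pd.read_csv(buf, compression="gzip")
    assert df.loc[0, "a"] == 1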
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_encoding_csv(httpserver, datapath):
"""
read_csv should honor the requested encoding for URLs.
GH 10424
"""
with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
httpserver.serve_content(content=f.read())
df = read_csv(httpserver.url, encoding="latin-1", header=None)
assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
@pytest.fixture
def tips_df(datapath):
"""DataFrame with the tips dataset."""
return read_csv(datapath("io", "data", "csv", "tips.csv"))
@pytest.mark.single_cpu
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # More of an integration test due to the not-public contents portion;
        # this could probably be mocked.
pytest.importorskip("s3fs")
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
# Read public file from bucket with not-public contents
pytest.importorskip("s3fs")
df = read_csv(
f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
# Read from AWS s3 as "s3n" URL
df = read_csv(
f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
nrows=10,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
# Read from AWS s3 as "s3a" URL
df = read_csv(
f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
nrows=10,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_nrows(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
nrows=10,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_chunked(
self, s3_public_bucket_with_data, tips_df, s3so
):
# Read with a chunksize
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
with read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
storage_options=s3so,
) as df_reader:
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them
# properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[
chunksize * i_chunk : chunksize * (i_chunk + 1)
]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_chunked_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
# Read with a chunksize using the Python parser
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
with read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
engine="python",
storage_options=s3so,
) as df_reader:
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[
chunksize * i_chunk : chunksize * (i_chunk + 1)
]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
for ext in ["", ".gz", ".bz2"]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
compression="infer",
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3_bucket_nrows_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
nrows=10,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_read_s3_fails(self, s3so):
msg = "The specified bucket does not exist"
with pytest.raises(OSError, match=msg):
read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
def test_read_s3_fails_private(self, s3_private_bucket, s3so):
msg = "The specified bucket does not exist"
# Without credentials, the private bucket surfaces as a missing bucket.
# It's irrelevant here that the key isn't actually a table.
with pytest.raises(OSError, match=msg):
read_csv(f"s3://{s3_private_bucket.name}/file.csv")
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
def test_write_s3_csv_fails(self, tips_df, s3so):
# GH 32486
# Attempting to write to an invalid S3 path should raise
import botocore
# GH 34087
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
# Catch a ClientError since AWS Service Errors are defined dynamically
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
tips_df.to_csv(
"s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
)
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
def test_write_s3_parquet_fails(self, tips_df, s3so):
# GH 27679
# Attempting to write to an invalid S3 path should raise
pytest.importorskip("pyarrow")
import botocore
# GH 34087
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
# Catch a ClientError since AWS Service Errors are defined dynamically
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
tips_df.to_parquet(
"s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
storage_options=s3so,
)
@pytest.mark.single_cpu
def test_read_csv_handles_boto_s3_object(
self, s3_public_bucket_with_data, tips_file
):
# see gh-16135
s3_object = s3_public_bucket_with_data.Object("tips.csv")
with BytesIO(s3_object.get()["Body"].read()) as buffer:
result = read_csv(buffer, encoding="utf8")
assert isinstance(result, DataFrame)
assert not result.empty
expected = read_csv(tips_file)
tm.assert_frame_equal(result, expected)
@pytest.mark.single_cpu
def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
# The file is ~8 MB and s3fs fetches 5 MB blocks, so reading just
# 5 rows must not download the whole file.
df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
with BytesIO(df.to_csv().encode("utf-8")) as buf:
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
uri = f"{s3_public_bucket.name}/large-file.csv"
match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
with caplog.at_level(logging.DEBUG, logger="s3fs"):
read_csv(
f"s3://{uri}",
nrows=5,
storage_options=s3so,
)
for log in caplog.messages:
if match := re.match(match_re, log):
# Less than 8 MB
assert int(match.group("stop")) < 8000000
def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
# GH 25945
result = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
)
tm.assert_frame_equal(tips_df, result)
def test_read_feather_s3_file_path(
self, s3_public_bucket_with_data, feather_file, s3so
):
# GH 29055
pytest.importorskip("pyarrow")
expected = read_feather(feather_file)
res = read_feather(
f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
storage_options=s3so,
)
tm.assert_frame_equal(expected, res)
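# Hedged usage sketch, not exercised by the suite: "storage_options" is
# passed straight through to the underlying fsspec/s3fs filesystem, so
# anonymous access to a public object (hypothetical bucket/key) looks like:
def _example_storage_options():  # pragma: no cover
    return read_csv(
        "s3://some-public-bucket/tips.csv",  # hypothetical URL
        storage_options={"anon": True},  # s3fs option for unauthenticated access
    )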

File diff suppressed because it is too large

View File

@ -0,0 +1,564 @@
"""
Tests that apply specifically to the Python parser. Unless a test covers a
Python-specific issue, the goal is to move it out of this module once the C
parser accepts the relevant arguments.
"""
from __future__ import annotations
import csv
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
from typing import TYPE_CHECKING
import numpy as np
import pytest
from pandas.errors import (
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
if TYPE_CHECKING:
from collections.abc import Iterator
def test_default_separator(python_parser_only):
# see gh-17333
#
# The stdlib csv.Sniffer treats "o" as the separator here
# (see the sketch after this test).
data = "aob\n1o2\n3o4"
parser = python_parser_only
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
result = parser.read_csv(StringIO(data), sep=None)
tm.assert_frame_equal(result, expected)
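# Hedged illustration, not part of the suite: the Python engine delegates
# separator sniffing to the stdlib csv.Sniffer when sep=None, which is what
# picks "o" in the test above.
def _example_sniff_separator():  # pragma: no cover
    import csv

    dialect = csv.Sniffer().sniff("aob\n1o2\n3o4")
    assert dialect.delimiter == "o"
    return dialect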
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter must be an integer"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
def test_invalid_skipfooter_negative(python_parser_only):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter cannot be negative"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=-1)
@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
def test_sniff_delimiter_comment(python_parser_only):
data = """# comment line
index|A|B|C
# comment line
foo|1|2|3 # ignore | this
bar|4|5|6
baz|7|8|9
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
parser = python_parser_only
data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
if encoding is not None:
data = data.encode(encoding)
data = BytesIO(data)
data = TextIOWrapper(data, encoding=encoding)
else:
data = StringIO(data)
result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
def test_single_line(python_parser_only):
# see gh-6607: sniff separator
parser = python_parser_only
result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
expected = DataFrame({"a": [1], "b": [2]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
# see gh-6607
data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), **kwargs)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
# see gh-6607
parser = python_parser_only
with open(csv1, "rb") as f:
data = f.read()
data = data.replace(b",", b"::")
expected = parser.read_csv(csv1)
module = pytest.importorskip(compression)
klass = getattr(module, klass)
with tm.ensure_clean() as path:
with klass(path, mode="wb") as tmp:
tmp.write(data)
result = parser.read_csv(path, sep="::", compression=compression)
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index(python_parser_only):
# see gh-6607
data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
parser = python_parser_only
expected = DataFrame(
[
[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
[-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
],
columns=["A", "B", "C", "D", "E"],
index=MultiIndex.from_tuples(
[("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
names=["one", "two", "three", "four"],
),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
# see gh-6893
data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
parser = python_parser_only
expected = DataFrame.from_records(
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
columns=list("abcABC"),
index=list("abc"),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
# see gh-6971
data = "1#2\n3#4"
parser = python_parser_only
expected = DataFrame({"a": [1.2, 3.4]})
if add_footer:
# The stray footer line should not mess with the
# casting of the first two lines if we skip it.
kwargs = {"skipfooter": 1}
data += "\nFooter"
else:
kwargs = {}
result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
"encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
# see gh-3404
expected = DataFrame({"a": [1], "b": [2]})
parser = python_parser_only
data = "1" + sep + "2"
encoded_data = data.encode(encoding)
result = parser.read_csv(
BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
# see gh-13374
kwargs = {"sep": ",,"}
parser = python_parser_only
data = 'a,,b\n1,,a\n2,,"2,,b"'
if quoting == csv.QUOTE_NONE:
msg = "Expected 2 fields in line 3, saw 3"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
else:
msg = "ignored when a multi-char delimiter is used"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
def test_none_delimiter(python_parser_only):
# see gh-13374 and gh-17465
parser = python_parser_only
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
# We expect the third line in the data to be
# skipped because it is malformed, but we do
# not expect any errors to occur.
with tm.assert_produces_warning(
ParserWarning, match="Skipping line 3", check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), header=0, sep=None, on_bad_lines="warn"
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
# see gh-13879 and gh-15910
parser = python_parser_only
if skipfooter:
msg = "parsing errors in the skipped footer rows"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
else:
msg = "unexpected end of data|expected after"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
def test_malformed_skipfooter(python_parser_only):
parser = python_parser_only
data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
msg = "Expected 3 fields in line 4, saw 5"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
def test_python_engine_file_no_next(python_parser_only):
parser = python_parser_only
class NoNextBuffer:
def __init__(self, csv_data) -> None:
self.data = csv_data
def __iter__(self) -> Iterator:
return self.data.__iter__()
def read(self):
return self.data
def readline(self):
return self.data
parser.read_csv(NoNextBuffer("a\n1"))
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
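# Hedged usage sketch (assumes pandas >= 1.4 with engine="python"): the
# on_bad_lines callable receives each malformed row as a list of strings and
# returns a replacement row, or None to drop the row entirely.
def _example_on_bad_lines():  # pragma: no cover
    import pandas as pd

    bad = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")
    return pd.read_csv(bad, engine="python", on_bad_lines=lambda row: row[:2])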
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
lst = []
def bad_line_func(bad_line: list[str]) -> list[str]:
lst.append(bad_line)
return ["2", "3"]
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
assert lst == [["2", "3", "4", "5", "6"]]
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
# GH 5686
# iterator=True takes a separate code path from iterator=False
parser = python_parser_only
data = f"""
0{sep}1
hi{sep}there
foo{sep}bar{sep}baz
good{sep}bye
"""
bad_sio = StringIO(data)
result_iter = parser.read_csv(
bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
)
expecteds = [
{"0": "hi", "1": "there"},
{"0": "foo", "1": "bar"},
{"0": "good", "1": "bye"},
]
for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
expected = DataFrame(expected, index=range(i, i + 1))
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
msg = "This function is buggy."
def bad_line_func(bad_line):
raise ValueError(msg)
with pytest.raises(ValueError, match=msg):
parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv_check_warnings(
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_callable_returns_none(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_index_col_inferred(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2,3
4,5,6
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
tm.assert_frame_equal(result, expected)
def test_index_col_false_and_header_none(python_parser_only):
# GH#46955
parser = python_parser_only
data = """
0.5,0.03
0.1,0.2,0.3,2
"""
result = parser.read_csv_check_warnings(
ParserWarning,
"Length of header",
StringIO(data),
sep=",",
header=None,
index_col=False,
)
expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
tm.assert_frame_equal(result, expected)
def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
# GH#46569
parser = python_parser_only
data = StringIO("a\na,b\nc,d,e\nf,g,h")
result = parser.read_csv_check_warnings(
ParserWarning, "Length of header", data, engine="python", index_col=False
)
expected = DataFrame({"a": ["a", "c", "f"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
# GH#50270
parser = python_parser_only
data = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
result = parser.read_csv(
StringIO(data),
sep=";",
dtype=dtype,
thousands=".",
)
expected = DataFrame(
{
"a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
"b": [16000, 0, 23000],
"c": [0, 4000, 131],
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype,expected",
[
(
{"a": str, "b": np.float64, "c": np.int64},
DataFrame(
{
"b": [16000.1, 0, 23000],
"c": [0, 4001, 131],
}
),
),
(
str,
DataFrame(
{
"b": ["16,000.1", "0", "23,000"],
"c": ["0", "4,001", "131"],
}
),
),
],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
# GH#50270
parser = python_parser_only
data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
result = parser.read_csv(
StringIO(data),
sep=";",
dtype=dtype,
thousands=",",
)
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
tm.assert_frame_equal(result, expected)
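# Hedged sketch: as the two tests above show, the thousands separator is only
# stripped from columns that parse as numeric; columns pinned to str/object
# keep their raw text.
def _example_thousands_respects_dtype():  # pragma: no cover
    import pandas as pd

    df = pd.read_csv(
        StringIO("a;b\n1,234;5,678"), sep=";", thousands=",", dtype={"a": str}
    )
    assert df["a"].iloc[0] == "1,234"  # kept verbatim
    assert df["b"].iloc[0] == 5678  # separator stripped, parsed as int
    return df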

View File

@ -0,0 +1,183 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.compat import PY311
from pandas.errors import ParserError
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
"kwargs,msg",
[
({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
(
{"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
"quotechar must be set if quoting enabled",
),
({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
],
)
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
def test_bad_quote_char(all_parsers, kwargs, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
@pytest.mark.parametrize(
"quoting,msg",
[
("foo", '"quoting" must be an integer|Argument'),
(10, 'bad "quoting" value'), # quoting must be in the range [0, 3]
],
)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
def test_bad_quoting(all_parsers, quoting, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting)
def test_quote_char_basic(all_parsers):
parser = all_parsers
data = 'a,b,c\n1,2,"cat"'
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), quotechar='"')
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
parser = all_parsers
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
data = 'a,b,c\n1,2,"cat"'
new_data = data.replace('"', quote_char)
result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
kwargs = {"quotechar": quote_char, "quoting": quoting}
data = "a,b,c\n1,2,3"
parser = all_parsers
if quoting != csv.QUOTE_NONE:
# Sanity checking.
msg = (
'"quotechar" must be a 1-character string'
if PY311 and all_parsers.engine == "python" and quote_char == ""
else "quotechar must be set if quoting enabled"
)
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
elif not (PY311 and all_parsers.engine == "python"):
# Python 3.11+ doesn't support null/blank quote chars in its csv parser
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,exp_data",
[
({}, [[1, 2, "foo"]]), # Test default.
# QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
# QUOTE_ALL only applies to CSV writing, so no effect on reading.
({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
# QUOTE_NONE tells the reader to do no special handling
# of quote characters and leave them alone.
({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
# QUOTE_NONNUMERIC tells the reader to cast
# all non-quoted fields to float
({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
],
)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
def test_quoting_various(all_parsers, kwargs, exp_data):
data = '1,2,"foo"'
parser = all_parsers
columns = ["a", "b", "c"]
result = parser.read_csv(StringIO(data), names=columns, **kwargs)
expected = DataFrame(exp_data, columns=columns)
tm.assert_frame_equal(result, expected)
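# Hedged illustration, not run by the suite: QUOTE_NONNUMERIC is the only
# quoting mode above that changes parsing, casting every unquoted field to
# float while quoted fields stay strings.
def _example_quote_nonnumeric():  # pragma: no cover
    import pandas as pd

    df = pd.read_csv(
        StringIO('1,2,"foo"'), names=["a", "b", "c"], quoting=csv.QUOTE_NONNUMERIC
    )
    assert df["a"].dtype.kind == "f"  # 1 and 2 were parsed as floats
    return df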
@pytest.mark.parametrize(
"doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data, request):
parser = all_parsers
data = 'a,b\n3,"4 "" 5"'
if parser.engine == "pyarrow" and not doublequote:
mark = pytest.mark.xfail(reason="Mismatched result")
request.applymarker(mark)
result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
expected = DataFrame(exp_data, columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
# see gh-14477
data = "a\n1"
parser = all_parsers
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), quotechar=quotechar)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced, request):
# see gh-22789.
parser = all_parsers
data = 'a,b,c\n1,2,"3'
if parser.engine == "pyarrow" and not balanced:
mark = pytest.mark.xfail(reason="Mismatched result")
request.applymarker(mark)
if balanced:
# Re-balance the quoting and read in without errors.
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data + '"'))
tm.assert_frame_equal(result, expected)
else:
msg = (
"EOF inside string starting at row 1"
if parser.engine == "c"
else "unexpected end of data"
)
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))

File diff suppressed because it is too large

View File

@ -0,0 +1,334 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
from pandas.errors import EmptyDataError
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
# see gh-505
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
result = parser.read_csv(
StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_deep_skip_rows(all_parsers):
# see gh-4382
parser = all_parsers
data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
)
condensed_data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
)
result = parser.read_csv(StringIO(data), skiprows=[6, 8])
condensed_result = parser.read_csv(StringIO(condensed_data))
tm.assert_frame_equal(result, condensed_result)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_skip_rows_blank(all_parsers):
# see gh-9832
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
data = parser.read_csv(
StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(data, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
{"skiprows": [1]},
DataFrame(
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
columns=["id", "text", "num_lines"],
),
),
(
"a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
{"quotechar": "~", "skiprows": [2]},
DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
),
(
(
"Text,url\n~example\n "
"sentence\n one~,url1\n~"
"example\n sentence\n two~,url2\n~"
"example\n sentence\n three~,url3"
),
{"quotechar": "~", "skiprows": [1, 3]},
DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
),
],
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_quote(all_parsers):
# see gh-12775 and gh-10911
parser = all_parsers
data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""
exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
result = parser.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,exp_data",
[
(
"""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
),
],
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), skiprows=[1])
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported
@pytest.mark.parametrize(
"lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
)
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
# see gh-9079
parser = all_parsers
data = "\n".join(
[
"SMOSMANIA ThetaProbe-ML2X ",
"2007/01/01 01:00 0.2140 U M ",
"2007/01/01 02:00 0.2141 M O ",
"2007/01/01 04:00 0.2142 D M ",
]
)
expected = DataFrame(
[
["2007/01/01", "01:00", 0.2140, "U", "M"],
["2007/01/01", "02:00", 0.2141, "M", "O"],
["2007/01/01", "04:00", 0.2142, "D", "M"],
],
columns=["date", "time", "var", "flag", "oflag"],
)
if parser.engine == "python" and lineterminator == "\r":
mark = pytest.mark.xfail(reason="'CR' is not respected by the Python parser yet")
request.applymarker(mark)
data = data.replace("\n", lineterminator)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_skiprows_infield_quote(all_parsers):
# see gh-14459
parser = all_parsers
data = 'a"\nb"\na\n1'
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), skiprows=2)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize(
"kwargs,expected",
[
({}, DataFrame({"1": [3, 5]})),
({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
tm.assert_frame_equal(result, expected)
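# Hedged sketch: a skiprows callable is evaluated on the 0-based row index of
# the file, header line included, and a True result skips that row.
def _example_skiprows_callable():  # pragma: no cover
    import pandas as pd

    # Skipping even rows drops "a", "2" and "4"; "1" becomes the header.
    return pd.read_csv(StringIO("a\n1\n2\n3\n4\n5"), skiprows=lambda x: x % 2 == 0)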
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_callable_not_in(all_parsers):
parser = all_parsers
data = "0,a\n1,b\n2,c\n3,d\n4,e"
expected = DataFrame([[1, "b"], [3, "d"]])
result = parser.read_csv(
StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_skip_all(all_parsers):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: True)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_bad_callable(all_parsers):
msg = "by zero"
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
with pytest.raises(ZeroDivisionError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_and_n_rows(all_parsers):
# GH#44021
data = """a,b
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
# GH 55677
data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
parser = all_parsers
reader = parser.read_csv(
StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
)
df1 = next(reader)
df2 = next(reader)
tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
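# Hedged note: with chunksize the default RangeIndex keeps counting across
# chunks, which is why the second chunk above is indexed [4, 5, 6].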

View File

@ -0,0 +1,342 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import (
BytesIO,
StringIO,
)
import numpy as np
import pytest
import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import (
TextFileReader,
read_csv,
)
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
class TestTextReader:
@pytest.fixture
def csv_path(self, datapath):
return datapath("io", "data", "csv", "test1.csv")
def test_file_handle(self, csv_path):
with open(csv_path, "rb") as f:
reader = TextReader(f)
reader.read()
def test_file_handle_mmap(self, csv_path):
# this was never using memory_map=True
with open(csv_path, "rb") as f:
reader = TextReader(f, header=None)
reader.read()
def test_StringIO(self, csv_path):
with open(csv_path, "rb") as f:
text = f.read()
src = BytesIO(text)
reader = TextReader(src, header=None)
reader.read()
def test_string_factorize(self):
# should this be optional?
data = "a\nb\na\nb\na"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert len(set(map(id, result[0]))) == 2
def test_skipinitialspace(self):
data = "a, b\na, b\na, b\na, b"
reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
)
def test_parse_booleans(self):
data = "True\nFalse\nTrue\nTrue"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert result[0].dtype == np.bool_
def test_delimit_whitespace(self):
data = 'a b\na\t\t "b"\n"a"\t \t b'
reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b"], dtype=np.object_)
)
def test_embedded_newline(self):
data = 'a\n"hello\nthere"\nthis'
reader = TextReader(StringIO(data), header=None)
result = reader.read()
expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
tm.assert_numpy_array_equal(result[0], expected)
def test_euro_decimal(self):
data = "12345,67\n345,678"
reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
result = reader.read()
expected = np.array([12345.67, 345.678])
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands(self):
data = "123,456\n12,500"
reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
result = reader.read()
expected = np.array([123456, 12500], dtype=np.int64)
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands_alt(self):
data = "123.456\n12.500"
reader = TextFileReader(
StringIO(data), delimiter=":", thousands=".", header=None
)
result = reader.read()
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)
def test_skip_bad_lines(self):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
reader = TextReader(StringIO(data), delimiter=":", header=None)
msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
with pytest.raises(parser.ParserError, match=msg):
reader.read()
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip
)
result = reader.read()
expected = {
0: np.array(["a", "d", "g", "l"], dtype=object),
1: np.array(["b", "e", "h", "m"], dtype=object),
2: np.array(["c", "f", "i", "n"], dtype=object),
}
assert_array_dicts_equal(result, expected)
with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()
def test_header_not_enough_lines(self):
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
reader = TextReader(StringIO(data), delimiter=",", header=2)
header = reader.header
expected = [["a", "b", "c"]]
assert header == expected
recs = reader.read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array([2, 5], dtype=np.int64),
2: np.array([3, 6], dtype=np.int64),
}
assert_array_dicts_equal(recs, expected)
def test_escapechar(self):
data = '\\"hello world"\n\\"hello world"\n\\"hello world"'
reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
result = reader.read()
expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
assert_array_dicts_equal(result, expected)
def test_eof_has_eol(self):
# handling of new line at EOF
pass
def test_na_substitution(self):
pass
def test_numpy_string_dtype(self):
data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""
def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", header=None, **kwds)
reader = _make_reader(dtype="S5,i4")
result = reader.read()
assert result[0].dtype == "S5"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
assert (result[0] == ex_values).all()
assert result[1].dtype == "i4"
reader = _make_reader(dtype="S4")
result = reader.read()
assert result[0].dtype == "S4"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
assert (result[0] == ex_values).all()
assert result[1].dtype == "S4"
def test_pass_dtype(self):
data = """\
one,two
1,a
2,b
3,c
4,d"""
def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(dtype={"one": "u1", 1: "S1"})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "S1"
reader = _make_reader(dtype={"one": np.uint8, 1: object})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
def test_usecols(self):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(usecols=(1, 2))
result = reader.read()
exp = _make_reader().read()
assert len(result) == 2
assert (result[1] == exp[1]).all()
assert (result[2] == exp[2]).all()
@pytest.mark.parametrize(
"text, kwargs",
[
("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
(
"a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
{"delim_whitespace": True},
),
("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
(
(
"A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
"AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
),
{"delimiter": ","},
),
("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
],
)
def test_cr_delimited(self, text, kwargs):
nice_text = text.replace("\r", "\r\n")
result = TextReader(StringIO(text), **kwargs).read()
expected = TextReader(StringIO(nice_text), **kwargs).read()
assert_array_dicts_equal(result, expected)
def test_empty_field_eof(self):
data = "a,b,c\n1,2,3\n4,,"
result = TextReader(StringIO(data), delimiter=",").read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array(["2", ""], dtype=object),
2: np.array(["3", ""], dtype=object),
}
assert_array_dicts_equal(result, expected)
@pytest.mark.parametrize("repeat", range(10))
def test_empty_field_eof_mem_access_bug(self, repeat):
# GH5664
a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
c = DataFrame(
[
[1, 2, 3, 4],
[6, np.nan, np.nan, np.nan],
[8, 9, 10, 11],
[13, 14, np.nan, np.nan],
],
columns=list("abcd"),
index=[0, 5, 7, 12],
)
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
tm.assert_frame_equal(df, a)
df = read_csv(
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
)
tm.assert_frame_equal(df, b)
df = read_csv(
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
names=list("abcd"),
engine="c",
)
tm.assert_frame_equal(df, c)
def test_empty_csv_input(self):
# GH14867
with read_csv(
StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
) as df:
assert isinstance(df, TextFileReader)
def assert_array_dicts_equal(left, right):
for k, v in left.items():
tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))

View File

@ -0,0 +1,226 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.
Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import os
from pathlib import Path
import pytest
from pandas.errors import ParserError
import pandas._testing as tm
from pandas.io.parsers import read_csv
import pandas.io.parsers.readers as parsers
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
return request.param
class TestUnsupportedFeatures:
def test_mangle_dupe_cols_false(self):
# see gh-12935
data = "a b c\n1 2 3"
for engine in ("c", "python"):
with pytest.raises(TypeError, match="unexpected keyword"):
read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)
def test_c_engine(self):
# see gh-6607
data = "a b c\n1 2 3"
msg = "does not support"
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
# specify C engine with unsupported options (raise)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep=r"\s")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", skipfooter=1)
# specify C-unsupported options without python-unsupported options
with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)):
read_csv(StringIO(data), sep=None, delim_whitespace=False)
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep=r"\s")
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep="\t", quotechar=chr(128))
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), skipfooter=1)
text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
msg = "Error tokenizing data"
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), sep="\\s+")
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), engine="c", sep="\\s+")
msg = "Only length-1 thousands markers supported"
data = """A|B|C
1|2,334|5
10|13|10.
"""
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands=",,")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands="")
msg = "Only length-1 line terminators supported"
data = "a,b,c~~1,2,3~~4,5,6"
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), lineterminator="~~")
def test_python_engine(self, python_engine):
from pandas.io.parsers.readers import _python_unsupported as py_unsupported
data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
for default in py_unsupported:
msg = (
f"The {repr(default)} option is not "
f"supported with the {repr(python_engine)} engine"
)
kwargs = {default: object()}
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=python_engine, **kwargs)
def test_python_engine_file_no_iter(self, python_engine):
# see gh-16530
class NoNextBuffer:
def __init__(self, csv_data) -> None:
self.data = csv_data
def __next__(self):
return self.data.__next__()
def read(self):
return self.data
def readline(self):
return self.data
data = "a\n1"
msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"
with pytest.raises(TypeError, match=msg):
read_csv(NoNextBuffer(data), engine=python_engine)
def test_pyarrow_engine(self):
from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported
data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
for default in pa_unsupported:
msg = (
f"The {repr(default)} option is not "
f"supported with the 'pyarrow' engine"
)
kwargs = {default: object()}
default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
if default == "dialect":
kwargs[default] = "excel" # test a random dialect
elif default in default_needs_bool:
kwargs[default] = True
elif default == "on_bad_lines":
kwargs[default] = "warn"
warn = None
depr_msg = None
if "delim_whitespace" in kwargs:
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
warn = FutureWarning
if "verbose" in kwargs:
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
warn = FutureWarning
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(warn, match=depr_msg):
read_csv(StringIO(data), engine="pyarrow", **kwargs)
def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
# GH 5686
# GH 54643
sio = StringIO("a,b\n1,2")
bad_lines_func = lambda x: x
parser = all_parsers
if all_parsers.engine not in ["python", "pyarrow"]:
msg = (
"on_bad_line can only be a callable "
"function if engine='python' or 'pyarrow'"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(sio, on_bad_lines=bad_lines_func)
else:
parser.read_csv(sio, on_bad_lines=bad_lines_func)
def test_close_file_handle_on_invalid_usecols(all_parsers):
# GH 45384
parser = all_parsers
error = ValueError
if parser.engine == "pyarrow":
# Raises pyarrow.lib.ArrowKeyError
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with tm.ensure_clean("test.csv") as fname:
Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
with tm.assert_produces_warning(False):
with pytest.raises(error, match="col3"):
parser.read_csv(fname, usecols=["col1", "col2", "col3"])
# unlink fails on windows if file handles still point to it
os.unlink(fname)
def test_invalid_file_inputs(request, all_parsers):
# GH#45957
parser = all_parsers
if parser.engine == "python":
request.applymarker(
pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
)
with pytest.raises(ValueError, match="Invalid"):
parser.read_csv([])
def test_invalid_dtype_backend(all_parsers):
parser = all_parsers
msg = (
"dtype_backend numpy is invalid, only 'numpy_nullable' and "
"'pyarrow' are allowed."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv("test", dtype_backend="numpy")

View File

@ -0,0 +1,102 @@
import numpy as np
import pytest
from pandas._libs.parsers import (
_maybe_upcast,
na_values,
)
import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
BooleanArray,
FloatingArray,
IntegerArray,
StringArray,
)
def test_maybe_upcast(any_real_numpy_dtype):
# GH#36712
dtype = np.dtype(any_real_numpy_dtype)
na_value = na_values[dtype]
arr = np.array([1, 2, na_value], dtype=dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, True])
if issubclass(dtype.type, np.integer):
expected = IntegerArray(arr, mask=expected_mask)
else:
expected = FloatingArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_no_na(any_real_numpy_dtype):
# GH#36712
arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, False])
if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
expected = IntegerArray(arr, mask=expected_mask)
else:
expected = FloatingArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_bool():
# GH#36712
dtype = np.bool_
na_value = na_values[dtype]
arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, True])
expected = BooleanArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_bool_no_nan():
# GH#36712
dtype = np.bool_
arr = np.array([True, False, False], dtype="uint8").view(dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, False])
expected = BooleanArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_all_nan():
# GH#36712
dtype = np.int64
na_value = na_values[dtype]
arr = np.array([na_value, na_value], dtype=dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([True, True])
expected = IntegerArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
# GH#36712
pa = pytest.importorskip("pyarrow")
with pd.option_context("mode.string_storage", string_storage):
arr = np.array(["a", "b", val], dtype=np.object_)
result = _maybe_upcast(arr, use_dtype_backend=True)
if string_storage == "python":
exp_val = "c" if val == "c" else NA
expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
else:
exp_val = "c" if val == "c" else None
expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
tm.assert_extension_array_equal(result, expected)
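# Hedged public-API counterpart of the private _maybe_upcast tests above:
# dtype_backend="numpy_nullable" makes read_csv build the same masked
# extension arrays (Int64/Float64/boolean) instead of NumPy dtypes with NaN.
def _example_dtype_backend():  # pragma: no cover
    from io import StringIO

    df = pd.read_csv(StringIO("a,b\n1,\n2,3"), dtype_backend="numpy_nullable")
    assert str(df["a"].dtype) == "Int64"  # masked integers, NA preserved
    return df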

View File

@ -0,0 +1,194 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
Index,
Timestamp,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
_msg_pyarrow_requires_names = (
"The pyarrow engine does not allow 'usecols' to be integer column "
"positions. Pass a list of string column names instead."
)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
# see gh-9755
data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parser = all_parsers
parse_dates = [[1, 2]]
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data), usecols=usecols, parse_dates=parse_dates
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), usecols=usecols, parse_dates=parse_dates
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
def test_usecols_with_parse_dates2(all_parsers):
# see gh-13604
parser = all_parsers
data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""
names = ["date", "values"]
usecols = names[:]
parse_dates = [0]
index = Index(
[
Timestamp("2008-02-07 09:40"),
Timestamp("2008-02-07 09:50"),
Timestamp("2008-02-07 10:00"),
],
name="date",
)
cols = {"values": [1032.43, 1042.54, 1051.65]}
expected = DataFrame(cols, index=index)
result = parser.read_csv(
StringIO(data),
parse_dates=parse_dates,
index_col=0,
usecols=usecols,
header=None,
names=names,
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
# see gh-14792
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
usecols = list("abcdefghij")
parse_dates = [0]
cols = {
"a": Timestamp("2016-09-21").as_unit("ns"),
"b": [1],
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
usecols = list("abcdefghij")
parse_dates = [[0, 1]]
parser = all_parsers
cols = {
"a_b": "2016/09/21 1",
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
usecols=usecols,
parse_dates=parse_dates,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
"names",
[
list("abcde"), # Names span all columns in original data.
list("acd"), # Names span only the selected columns.
],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
# see gh-9755
s = """0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parse_dates = [[1, 2]]
parser = all_parsers
if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
mark = pytest.mark.xfail(
reason="Length mismatch in some cases, UserWarning in other"
)
request.applymarker(mark)
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,96 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
def test_usecols_with_unicode_strings(all_parsers):
# see gh-13219
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"AAA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"BBB": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_single_byte_unicode_strings(all_parsers):
# see gh-13219
data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"A": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"B": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
_msg_validate_usecols_arg = (
"'usecols' must either be list-like "
"of all strings, all unicode, all "
"integers or a callable."
)
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"あああ": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"いい": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,563 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
    array,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

_msg_validate_usecols_arg = (
    "'usecols' must either be list-like "
    "of all strings, all unicode, all "
    "integers or a callable."
)
_msg_validate_usecols_names = (
    "Usecols do not match columns, columns expected but not found: {0}"
)
_msg_pyarrow_requires_names = (
    "The pyarrow engine does not allow 'usecols' to be integer column "
    "positions. Pass a list of string column names instead."
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_raise_on_mixed_dtype_usecols(all_parsers):
# See gh-12678
data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
usecols = [0, "b", 2]
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols, request):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
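# with header=0 and user-supplied names, integer usecols still refer to
# column positions in the original file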
def test_usecols_with_names(all_parsers):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
names = ["foo", "bar"]
if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
return
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
tm.assert_frame_equal(result, expected)
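# usecols may be given as positions or as labels drawn from the
# user-supplied names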
@pytest.mark.parametrize(
"names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
# ArrowKeyError: Column 'fb' in include_columns does not exist
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_relative_to_names2(all_parsers):
# see gh-5766
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
)
expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# regex mismatch: "Length mismatch: Expected axis has 1 elements"
@xfail_pyarrow
def test_usecols_name_length_conflict(all_parsers):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
msg = "Number of passed names did not match number of header fields in the file"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
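# a bare string is not treated as a one-element list and must be rejected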
def test_usecols_single_string(all_parsers):
# see gh-20558
parser = all_parsers
data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols="foo")
@skip_pyarrow # CSV parse error in one case, AttributeError in another
@pytest.mark.parametrize(
"data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
# see gh-9082
parser = all_parsers
usecols = ["a", "c", "d"]
expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
return
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_conflict2(all_parsers):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
expected = expected.set_index(["b", "c"])
result = parser.read_csv(
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
def test_usecols_implicit_index_col(all_parsers):
# see gh-2654
parser = all_parsers
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
result = parser.read_csv(StringIO(data), usecols=["a", "b"])
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
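# index_col may name any column of the usecols selection, not just the first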
def test_usecols_index_col_middle(all_parsers):
# GH#9098
parser = all_parsers
data = """a,b,c,d
1,2,3,4
"""
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c")
expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_end(all_parsers):
# GH#9098
parser = all_parsers
data = """a,b,c,d
1,2,3,4
"""
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d")
expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))
tm.assert_frame_equal(result, expected)
def test_usecols_regex_sep(all_parsers):
# see gh-2733
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
return
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
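# the deprecated delim_whitespace keyword should still honor usecols
# while emitting a FutureWarning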
def test_usecols_with_whitespace(all_parsers):
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
# Column selection by index.
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
# Column selection by name.
(
["0", "1"],
DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
),
],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request):
parser = all_parsers
data = """2,0,1
1000,2000,3000
4000,5000,6000"""
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
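# an empty usecols selection should yield an empty DataFrame, not raise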
@xfail_pyarrow # mismatched shape
def test_empty_usecols(all_parsers):
data = "a,b,c\n1,2,3\n4,5,6"
expected = DataFrame(columns=Index([]))
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=set())
tm.assert_frame_equal(result, expected)
def test_np_array_usecols(all_parsers):
# see gh-12546
parser = all_parsers
data = "a,b,c\n1,2,3"
usecols = np.array(["a", "b"])
expected = DataFrame([[1, 2]], columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
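# a callable usecols is applied to each header name; columns for which it
# returns True are kept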
@pytest.mark.parametrize(
"usecols,expected",
[
(
lambda x: x.upper() in ["AAA", "BBB", "DDD"],
DataFrame(
{
"AaA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"bBb": {0: 8, 1: 2, 2: 7},
"ddd": {0: "a", 1: "b", 2: "a"},
}
),
),
(lambda x: False, DataFrame(columns=Index([]))),
],
)
def test_callable_usecols(all_parsers, usecols, expected):
# see gh-14154
data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
# see gh-6710
data = "1,2\n1,2,3"
parser = all_parsers
names = ["a", "b", "c"]
expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
@pytest.mark.parametrize(
"data,usecols,kwargs,expected",
[
# see gh-8985
(
"19,29,39\n" * 2 + "10,20,30,40",
[0, 1, 2],
{"header": None},
DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
),
# see gh-9549
(
("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
["A", "B", "C"],
{},
DataFrame(
{
"A": [1, 3, 1, 1, 1, 5],
"B": [2, 4, 2, 2, 2, 6],
"C": [3, 5, 4, 3, 3, 7],
}
),
),
],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
# see gh-8985
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
tm.assert_frame_equal(result, expected)
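# usecols entries found in neither the header nor the provided names
# should raise, naming the missing labels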
@pytest.mark.parametrize(
"usecols,kwargs,expected,msg",
[
(
["a", "b", "c", "d"],
{},
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
None,
),
(
["a", "b", "c", "f"],
{},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
(
["a", "b", "f", "g"],
{},
None,
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
),
# see gh-14671
(
None,
{"header": 0, "names": ["A", "B", "C", "D"]},
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
None,
),
(
["A", "B", "C", "f"],
{"header": 0, "names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(
["A", "B", "f"],
{"names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
],
)
def test_raises_on_usecols_names_mismatch(
all_parsers, usecols, kwargs, expected, msg, request
):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
kwargs.update(usecols=usecols)
parser = all_parsers
if parser.engine == "pyarrow" and not (
usecols is not None and expected is not None
):
# everything but the first case
# ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
if expected is None:
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
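# when usecols is a subset of the replacement names, only those renamed
# columns are returned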
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
names = ["A", "B", "C", "D"]
parser = all_parsers
if parser.engine == "pyarrow":
if isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
return
# "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
tm.assert_frame_equal(result, expected)
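# out-of-bounds integer usecols were once silently ignored; since 2.0 they
# raise a ParserError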
@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
# GH#25623 & GH 41130; enforced in 2.0
parser = all_parsers
data = """
a,b
1,2
"""
err = ParserError
msg = "Defining usecols with out-of-bounds"
if parser.engine == "pyarrow":
err = ValueError
msg = _msg_pyarrow_requires_names
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
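# rows with more fields than the header should not break selection with a
# callable usecols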
def test_usecols_additional_columns(all_parsers):
# GH#46997
parser = all_parsers
usecols = lambda header: header.strip() in ["a", "b", "c"]
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
expected = DataFrame({"a": ["x"], "b": "y"})
tm.assert_frame_equal(result, expected)
def test_usecols_additional_columns_integer_columns(all_parsers):
# GH#46997
parser = all_parsers
usecols = lambda header: header.strip() in ["0", "1"]
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
expected = DataFrame({"0": ["x"], "1": "y"})
tm.assert_frame_equal(result, expected)
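# dtype may mention columns that usecols drops; such entries are ignored
# rather than raising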
def test_usecols_dtype(all_parsers):
parser = all_parsers
data = """
col1,col2,col3
a,1,x
b,2,y
"""
result = parser.read_csv(
StringIO(data),
usecols=["col1", "col2"],
dtype={"col1": "string", "col2": "uint8", "col3": "string"},
)
expected = DataFrame(
{"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
)
tm.assert_frame_equal(result, expected)