I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,7 @@
"""
Test files dedicated to individual (stand-alone) DataFrame methods.
Ideally these files/tests should correspond 1-to-1 with those in tests.series.methods.
These may also present opportunities for sharing/de-duplicating test code.
"""

View File

@ -0,0 +1,49 @@
import pytest
from pandas import Index
import pandas._testing as tm
def test_add_prefix_suffix(float_frame):
with_prefix = float_frame.add_prefix("foo#")
expected = Index([f"foo#{c}" for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_suffix = float_frame.add_suffix("#foo")
expected = Index([f"{c}#foo" for c in float_frame.columns])
tm.assert_index_equal(with_suffix.columns, expected)
with_pct_prefix = float_frame.add_prefix("%")
expected = Index([f"%{c}" for c in float_frame.columns])
tm.assert_index_equal(with_pct_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix("%")
expected = Index([f"{c}%" for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_add_prefix_suffix_axis(float_frame):
# GH 47819
with_prefix = float_frame.add_prefix("foo#", axis=0)
expected = Index([f"foo#{c}" for c in float_frame.index])
tm.assert_index_equal(with_prefix.index, expected)
with_prefix = float_frame.add_prefix("foo#", axis=1)
expected = Index([f"foo#{c}" for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix("#foo", axis=0)
expected = Index([f"{c}#foo" for c in float_frame.index])
tm.assert_index_equal(with_pct_suffix.index, expected)
with_pct_suffix = float_frame.add_suffix("#foo", axis=1)
expected = Index([f"{c}#foo" for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_add_prefix_suffix_invalid_axis(float_frame):
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
float_frame.add_prefix("foo#", axis=2)
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
float_frame.add_suffix("foo#", axis=2)

View File

@ -0,0 +1,484 @@
from datetime import timezone
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
date_range,
)
import pandas._testing as tm
class TestDataFrameAlign:
def test_align_asfreq_method_raises(self):
df = DataFrame({"A": [1, np.nan, 2]})
msg = "Invalid fill method"
msg2 = "The 'method', 'limit', and 'fill_axis' keywords"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
df.align(df.iloc[::-1], method="asfreq")
def test_frame_align_aware(self):
idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern")
idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern")
df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1)
df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2)
new1, new2 = df1.align(df2)
assert df1.index.tz == new1.index.tz
assert df2.index.tz == new2.index.tz
# different timezones convert to UTC
# frame with frame
df1_central = df1.tz_convert("US/Central")
new1, new2 = df1.align(df1_central)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
# frame with Series
new1, new2 = df1.align(df1_central[0], axis=0)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
df1[0].align(df1_central, axis=0)
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc
def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(float_frame)
assert af._mgr is not float_frame._mgr
af, bf = float_frame.align(float_frame, copy=False)
if not using_copy_on_write:
assert af._mgr is float_frame._mgr
else:
assert af._mgr is not float_frame._mgr
# axis = 0
other = float_frame.iloc[:-5, :3]
af, bf = float_frame.align(other, axis=0, fill_value=-1)
tm.assert_index_equal(bf.columns, other.columns)
# test fill value
join_idx = float_frame.index.join(other.index)
diff_a = float_frame.index.difference(join_idx)
diff_a_vals = af.reindex(diff_a).values
assert (diff_a_vals == -1).all()
af, bf = float_frame.align(other, join="right", axis=0)
tm.assert_index_equal(bf.columns, other.columns)
tm.assert_index_equal(bf.index, other.index)
tm.assert_index_equal(af.index, other.index)
# axis = 1
other = float_frame.iloc[:-5, :3].copy()
af, bf = float_frame.align(other, axis=1)
tm.assert_index_equal(bf.columns, float_frame.columns)
tm.assert_index_equal(bf.index, other.index)
# test fill value
join_idx = float_frame.index.join(other.index)
diff_a = float_frame.index.difference(join_idx)
diff_a_vals = af.reindex(diff_a).values
assert (diff_a_vals == -1).all()
af, bf = float_frame.align(other, join="inner", axis=1)
tm.assert_index_equal(bf.columns, other.columns)
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
tm.assert_index_equal(bf.columns, other.columns)
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
)
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
# Try to align DataFrame to Series along bad axis
msg = "No axis named 2 for object type DataFrame"
with pytest.raises(ValueError, match=msg):
float_frame.align(af.iloc[0, :3], join="inner", axis=2)
def test_align_frame_with_series(self, float_frame):
# align dataframe to series with broadcast or not
idx = float_frame.index
s = Series(range(len(idx)), index=idx)
left, right = float_frame.align(s, axis=0)
tm.assert_index_equal(left.index, float_frame.index)
tm.assert_index_equal(right.index, float_frame.index)
assert isinstance(right, Series)
msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
left, right = float_frame.align(s, broadcast_axis=1)
tm.assert_index_equal(left.index, float_frame.index)
expected = {c: s for c in float_frame.columns}
expected = DataFrame(
expected, index=float_frame.index, columns=float_frame.columns
)
tm.assert_frame_equal(right, expected)
def test_align_series_condition(self):
# see gh-9558
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result = df[df["a"] == 2]
expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
result = df.where(df["a"] == 2, 0)
expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
tm.assert_frame_equal(result, expected)
def test_align_int(self, int_frame):
# test other non-float types
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = int_frame.align(other, join="inner", axis=1, method="pad")
tm.assert_index_equal(bf.columns, other.columns)
def test_align_mixed_type(self, float_string_frame):
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = float_string_frame.align(
float_string_frame, join="inner", axis=1, method="pad"
)
tm.assert_index_equal(bf.columns, float_string_frame.columns)
def test_align_mixed_float(self, mixed_float_frame):
# mixed floats/ints
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = mixed_float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
def test_align_mixed_int(self, mixed_int_frame):
other = DataFrame(index=range(5), columns=["A", "B", "C"])
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
af, bf = mixed_int_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
@pytest.mark.parametrize(
"l_ordered,r_ordered,expected",
[
[True, True, pd.CategoricalIndex],
[True, False, Index],
[False, True, Index],
[False, False, pd.CategoricalIndex],
],
)
def test_align_categorical(self, l_ordered, r_ordered, expected):
# GH-28397
df_1 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
"B": Series(list("aabbca")).astype(
pd.CategoricalDtype(list("cab"), ordered=l_ordered)
),
}
).set_index("B")
df_2 = DataFrame(
{
"A": np.arange(5, dtype="int64"),
"B": Series(list("babca")).astype(
pd.CategoricalDtype(list("cab"), ordered=r_ordered)
),
}
).set_index("B")
aligned_1, aligned_2 = df_1.align(df_2)
assert isinstance(aligned_1.index, expected)
assert isinstance(aligned_2.index, expected)
tm.assert_index_equal(aligned_1.index, aligned_2.index)
def test_align_multiindex(self):
# GH#10665
# same test cases as test_align_multiindex in test_series.py
midx = pd.MultiIndex.from_product(
[range(2), range(3), range(2)], names=("a", "b", "c")
)
idx = Index(range(2), name="b")
df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)
# these must be the same results (but flipped)
res1l, res1r = df1.align(df2, join="left")
res2l, res2r = df2.align(df1, join="right")
expl = df1
tm.assert_frame_equal(expl, res1l)
tm.assert_frame_equal(expl, res2r)
expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
tm.assert_frame_equal(expr, res1r)
tm.assert_frame_equal(expr, res2l)
res1l, res1r = df1.align(df2, join="right")
res2l, res2r = df2.align(df1, join="left")
exp_idx = pd.MultiIndex.from_product(
[range(2), range(2), range(2)], names=("a", "b", "c")
)
expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
tm.assert_frame_equal(expl, res1l)
tm.assert_frame_equal(expl, res2r)
expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
tm.assert_frame_equal(expr, res1r)
tm.assert_frame_equal(expr, res2l)
def test_align_series_combinations(self):
df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
s = Series([1, 2, 4], index=list("ABD"), name="x")
# frame + series
res1, res2 = df.align(s, axis=0)
exp1 = DataFrame(
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
index=list("ABCDE"),
)
exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
tm.assert_frame_equal(res1, exp1)
tm.assert_series_equal(res2, exp2)
# series + frame
res1, res2 = s.align(df)
tm.assert_series_equal(res1, exp2)
tm.assert_frame_equal(res2, exp1)
def test_multiindex_align_to_series_with_common_index_level(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series(
[1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2, 3, 4], name="bar")
series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
df = DataFrame(
{"col": np.arange(12)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series(
[1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
)
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 3, 4], name="bar")
series = Series(
[1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(9)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)
tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")
series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
np.arange(18).reshape(6, 3),
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
df.columns = ["cfoo", "cbar", "cfoo"]
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
result_left, result_right = df.align(series, axis=0)
tm.assert_series_equal(result_right, expected)
tm.assert_index_equal(result_left.columns, df.columns)
def test_missing_axis_specification_exception(self):
df = DataFrame(np.arange(50).reshape((10, 5)))
series = Series(np.arange(5))
with pytest.raises(ValueError, match=r"axis=0 or 1"):
df.align(series)
@pytest.mark.parametrize("method", ["pad", "bfill"])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("fill_axis", [0, 1])
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize(
"left_slice",
[
[slice(4), slice(10)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize(
"right_slice",
[
[slice(2, None), slice(6, None)],
[slice(0), slice(0)],
],
)
@pytest.mark.parametrize("limit", [1, None])
def test_align_fill_method(
self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
):
frame = float_frame
left = frame.iloc[left_slice[0], left_slice[1]]
right = frame.iloc[right_slice[0], right_slice[1]]
msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
"are deprecated"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
aa, ab = left.align(
right,
axis=axis,
join=how,
method=method,
limit=limit,
fill_axis=fill_axis,
)
join_index, join_columns = None, None
ea, eb = left, right
if axis is None or axis == 0:
join_index = left.index.join(right.index, how=how)
ea = ea.reindex(index=join_index)
eb = eb.reindex(index=join_index)
if axis is None or axis == 1:
join_columns = left.columns.join(right.columns, how=how)
ea = ea.reindex(columns=join_columns)
eb = eb.reindex(columns=join_columns)
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
eb = eb.fillna(axis=fill_axis, method=method, limit=limit)
tm.assert_frame_equal(aa, ea)
tm.assert_frame_equal(ab, eb)
def test_align_series_check_copy(self):
# GH#
df = DataFrame({0: [1, 2]})
ser = Series([1], name=0)
expected = ser.copy()
result, other = df.align(ser, axis=1)
ser.iloc[0] = 100
tm.assert_series_equal(other, expected)
def test_align_identical_different_object(self):
# GH#51032
df = DataFrame({"a": [1, 2]})
ser = Series([3, 4])
result, result2 = df.align(ser, axis=0)
tm.assert_frame_equal(result, df)
tm.assert_series_equal(result2, ser)
assert df is not result
assert ser is not result2
def test_align_identical_different_object_columns(self):
# GH#51032
df = DataFrame({"a": [1, 2]})
ser = Series([1], index=["a"])
result, result2 = df.align(ser, axis=1)
tm.assert_frame_equal(result, df)
tm.assert_series_equal(result2, ser)
assert df is not result
assert ser is not result2

View File

@ -0,0 +1,263 @@
from datetime import datetime
import numpy as np
import pytest
from pandas._libs.tslibs.offsets import MonthEnd
from pandas import (
DataFrame,
DatetimeIndex,
Series,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
from pandas.tseries import offsets
class TestAsFreq:
@pytest.fixture(params=["s", "ms", "us", "ns"])
def unit(self, request):
return request.param
def test_asfreq2(self, frame_or_series):
ts = frame_or_series(
[0.0, 1.0, 2.0],
index=DatetimeIndex(
[
datetime(2009, 10, 30),
datetime(2009, 11, 30),
datetime(2009, 12, 31),
],
dtype="M8[ns]",
freq="BME",
),
)
daily_ts = ts.asfreq("B")
monthly_ts = daily_ts.asfreq("BME")
tm.assert_equal(monthly_ts, ts)
daily_ts = ts.asfreq("B", method="pad")
monthly_ts = daily_ts.asfreq("BME")
tm.assert_equal(monthly_ts, ts)
daily_ts = ts.asfreq(offsets.BDay())
monthly_ts = daily_ts.asfreq(offsets.BMonthEnd())
tm.assert_equal(monthly_ts, ts)
result = ts[:0].asfreq("ME")
assert len(result) == 0
assert result is not ts
if frame_or_series is Series:
daily_ts = ts.asfreq("D", fill_value=-1)
result = daily_ts.value_counts().sort_index()
expected = Series(
[60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count"
).sort_index()
tm.assert_series_equal(result, expected)
def test_asfreq_datetimeindex_empty(self, frame_or_series):
# GH#14320
index = DatetimeIndex(["2016-09-29 11:00"])
expected = frame_or_series(index=index, dtype=object).asfreq("h")
result = frame_or_series([3], index=index.copy()).asfreq("h")
tm.assert_index_equal(expected.index, result.index)
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_tz_aware_asfreq_smoke(self, tz, frame_or_series):
dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz)
obj = frame_or_series(
np.random.default_rng(2).standard_normal(len(dr)), index=dr
)
# it works!
obj.asfreq("min")
def test_asfreq_normalize(self, frame_or_series):
rng = date_range("1/1/2000 09:30", periods=20)
norm = date_range("1/1/2000", periods=20)
vals = np.random.default_rng(2).standard_normal((20, 3))
obj = DataFrame(vals, index=rng)
expected = DataFrame(vals, index=norm)
if frame_or_series is Series:
obj = obj[0]
expected = expected[0]
result = obj.asfreq("D", normalize=True)
tm.assert_equal(result, expected)
def test_asfreq_keep_index_name(self, frame_or_series):
# GH#9854
index_name = "bar"
index = date_range("20130101", periods=20, name=index_name)
obj = DataFrame(list(range(20)), columns=["foo"], index=index)
obj = tm.get_obj(obj, frame_or_series)
assert index_name == obj.index.name
assert index_name == obj.asfreq("10D").index.name
def test_asfreq_ts(self, frame_or_series):
index = period_range(freq="Y", start="1/1/2001", end="12/31/2010")
obj = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 3)), index=index
)
obj = tm.get_obj(obj, frame_or_series)
result = obj.asfreq("D", how="end")
exp_index = index.asfreq("D", how="end")
assert len(result) == len(obj)
tm.assert_index_equal(result.index, exp_index)
result = obj.asfreq("D", how="start")
exp_index = index.asfreq("D", how="start")
assert len(result) == len(obj)
tm.assert_index_equal(result.index, exp_index)
def test_asfreq_resample_set_correct_freq(self, frame_or_series):
# GH#5613
# we test if .asfreq() and .resample() set the correct value for .freq
dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"])
obj = DataFrame({"col": [1, 2, 3]}, index=dti)
obj = tm.get_obj(obj, frame_or_series)
# testing the settings before calling .asfreq() and .resample()
assert obj.index.freq is None
assert obj.index.inferred_freq == "D"
# does .asfreq() set .freq correctly?
assert obj.asfreq("D").index.freq == "D"
# does .resample() set .freq correctly?
assert obj.resample("D").asfreq().index.freq == "D"
def test_asfreq_empty(self, datetime_frame):
# test does not blow up on length-0 DataFrame
zero_length = datetime_frame.reindex([])
result = zero_length.asfreq("BME")
assert result is not zero_length
def test_asfreq(self, datetime_frame):
offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd())
rule_monthly = datetime_frame.asfreq("BME")
tm.assert_frame_equal(offset_monthly, rule_monthly)
rule_monthly.asfreq("B", method="pad")
# TODO: actually check that this worked.
# don't forget!
rule_monthly.asfreq("B", method="pad")
def test_asfreq_datetimeindex(self):
df = DataFrame(
{"A": [1, 2, 3]},
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
)
df = df.asfreq("B")
assert isinstance(df.index, DatetimeIndex)
ts = df["A"].asfreq("B")
assert isinstance(ts.index, DatetimeIndex)
def test_asfreq_fillvalue(self):
# test for fill value during upsampling, related to issue 3715
# setup
rng = date_range("1/1/2016", periods=10, freq="2s")
# Explicit cast to 'float' to avoid implicit cast when setting None
ts = Series(np.arange(len(rng)), index=rng, dtype="float")
df = DataFrame({"one": ts})
# insert pre-existing missing value
df.loc["2016-01-01 00:00:08", "one"] = None
actual_df = df.asfreq(freq="1s", fill_value=9.0)
expected_df = df.asfreq(freq="1s").fillna(9.0)
expected_df.loc["2016-01-01 00:00:08", "one"] = None
tm.assert_frame_equal(expected_df, actual_df)
expected_series = ts.asfreq(freq="1s").fillna(9.0)
actual_series = ts.asfreq(freq="1s", fill_value=9.0)
tm.assert_series_equal(expected_series, actual_series)
def test_asfreq_with_date_object_index(self, frame_or_series):
rng = date_range("1/1/2000", periods=20)
ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng)
ts2 = ts.copy()
ts2.index = [x.date() for x in ts2.index]
result = ts2.asfreq("4h", method="ffill")
expected = ts.asfreq("4h", method="ffill")
tm.assert_equal(result, expected)
def test_asfreq_with_unsorted_index(self, frame_or_series):
# GH#39805
# Test that rows are not dropped when the datetime index is out of order
index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"])
result = frame_or_series(range(4), index=index)
expected = result.reindex(sorted(index))
expected.index = expected.index._with_freq("infer")
result = result.asfreq("D")
tm.assert_equal(result, expected)
def test_asfreq_after_normalize(self, unit):
# https://github.com/pandas-dev/pandas/issues/50727
result = DatetimeIndex(
date_range("2000", periods=2).as_unit(unit).normalize(), freq="D"
)
expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit)
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
"freq, freq_half",
[
("2ME", "ME"),
(MonthEnd(2), MonthEnd(1)),
],
)
def test_asfreq_2ME(self, freq, freq_half):
index = date_range("1/1/2000", periods=6, freq=freq_half)
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)})
expected = df.asfreq(freq=freq)
index = date_range("1/1/2000", periods=3, freq=freq)
result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"freq, freq_depr",
[
("2ME", "2M"),
("2QE", "2Q"),
("2QE-SEP", "2Q-SEP"),
("1BQE", "1BQ"),
("2BQE-SEP", "2BQ-SEP"),
("1YE", "1Y"),
("2YE-MAR", "2Y-MAR"),
("1YE", "1A"),
("2YE-MAR", "2A-MAR"),
("2BYE-MAR", "2BA-MAR"),
],
)
def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr):
# GH#9586, #55978
depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed "
f"in a future version, please use '{freq[1:]}' instead."
index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}")
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
expected = df.asfreq(freq=freq)
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
result = df.asfreq(freq=freq_depr)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,198 @@
import numpy as np
import pytest
from pandas._libs.tslibs import IncompatibleFrequency
from pandas import (
DataFrame,
Period,
Series,
Timestamp,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
@pytest.fixture
def date_range_frame():
    """
    Fixture for DataFrame of ints with date_range index

    The frame has 50 rows spaced 53 seconds apart starting at 1990-01-01.
    Columns are ['A', 'B'], each holding 0..49.
    """
    n_rows = 50
    index = date_range("1/1/1990", periods=n_rows, freq="53s")
    data = {label: np.arange(n_rows) for label in ("A", "B")}
    return DataFrame(data, index=index)
class TestFrameAsof:
def test_basic(self, date_range_frame):
# Explicitly cast to float to avoid implicit cast when setting np.nan
df = date_range_frame.astype({"A": "float"})
N = 50
df.loc[df.index[15:30], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
result = df.asof(dates)
assert result.notna().all(1).all()
lb = df.index[14]
ub = df.index[30]
dates = list(dates)
result = df.asof(dates)
assert result.notna().all(1).all()
mask = (result.index >= lb) & (result.index < ub)
rs = result[mask]
assert (rs == 14).all(1).all()
def test_subset(self, date_range_frame):
N = 10
# explicitly cast to float to avoid implicit upcast when setting to np.nan
df = date_range_frame.iloc[:N].copy().astype({"A": "float"})
df.loc[df.index[4:8], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
# with a subset of A should be the same
result = df.asof(dates, subset="A")
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# same with A/B
result = df.asof(dates, subset=["A", "B"])
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# B gives df.asof
result = df.asof(dates, subset="B")
expected = df.resample("25s", closed="right").ffill().reindex(dates)
expected.iloc[20:] = 9
# no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent)
expected["B"] = expected["B"].astype(df["B"].dtype)
tm.assert_frame_equal(result, expected)
def test_missing(self, date_range_frame):
# GH 15118
# no match found - `where` value before earliest date in index
N = 10
# Cast to 'float64' to avoid upcast when introducing nan in df.asof
df = date_range_frame.iloc[:N].copy().astype("float64")
result = df.asof("1989-12-31")
expected = Series(
index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64
)
tm.assert_series_equal(result, expected)
result = df.asof(to_datetime(["1989-12-31"]))
expected = DataFrame(
index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
)
tm.assert_frame_equal(result, expected)
# Check that we handle PeriodIndex correctly, dont end up with
# period.ordinal for series name
df = df.to_period("D")
result = df.asof("1989-12-31")
assert isinstance(result.name, Period)
def test_asof_all_nans(self, frame_or_series):
# GH 15713
# DataFrame/Series is all nans
result = frame_or_series([np.nan]).asof([0])
expected = frame_or_series([np.nan])
tm.assert_equal(result, expected)
def test_all_nans(self, date_range_frame):
# GH 15713
# DataFrame is all nans
# testing non-default indexes, multiple inputs
N = 150
rng = date_range_frame.index
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A"])
tm.assert_frame_equal(result, expected)
# testing multiple columns
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
# testing scalar input
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
tm.assert_frame_equal(result, expected)
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
expected = Series(np.nan, index=["A", "B"], name=3)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"stamp,expected",
[
(
Timestamp("2018-01-01 23:22:43.325+00:00"),
Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
),
(
Timestamp("2018-01-01 22:33:20.682+01:00"),
Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
),
],
)
def test_time_zone_aware_index(self, stamp, expected):
# GH21194
# Testing awareness of DataFrame index considering different
# UTC and timezone
df = DataFrame(
data=[1, 2],
index=[
Timestamp("2018-01-01 21:00:05.001+00:00"),
Timestamp("2018-01-01 22:35:10.550+00:00"),
],
)
result = df.asof(stamp)
tm.assert_series_equal(result, expected)
def test_is_copy(self, date_range_frame):
# GH-27357, GH-30784: ensure the result of asof is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = date_range_frame.astype({"A": "float"})
N = 50
df.loc[df.index[15:30], "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
result = df.asof(dates)
with tm.assert_produces_warning(None):
result["C"] = 1
def test_asof_periodindex_mismatched_freq(self):
N = 50
rng = period_range("1/1/1990", periods=N, freq="h")
df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng)
# Mismatched freq
msg = "Input has different freq"
with pytest.raises(IncompatibleFrequency, match=msg):
df.asof(rng.asfreq("D"))
def test_asof_preserves_bool_dtype(self):
# GH#16063 was casting bools to floats
dti = date_range("2017-01-01", freq="MS", periods=4)
ser = Series([True, False, True], index=dti[:-1])
ts = dti[-1]
res = ser.asof([ts])
expected = Series([True], index=[ts])
tm.assert_series_equal(res, expected)

View File

@ -0,0 +1,84 @@
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestAssign:
    """Tests for ``DataFrame.assign``."""

    def test_assign(self):
        """New and overwritten columns, given as values or callables."""
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        original = df.copy()

        expected = df.copy()
        expected["C"] = [4, 2.5, 2]

        result = df.assign(C=df.B / df.A)
        tm.assert_frame_equal(result, expected)

        # lambda syntax
        result = df.assign(C=lambda x: x.B / x.A)
        tm.assert_frame_equal(result, expected)

        # original is unmodified
        tm.assert_frame_equal(df, original)

        # Non-Series array-like
        result = df.assign(C=[4, 2.5, 2])
        tm.assert_frame_equal(result, expected)
        # original is unmodified
        tm.assert_frame_equal(df, original)

        # replacing an existing column keeps its position
        result = df.assign(B=df.B / df.A)
        expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
        tm.assert_frame_equal(result, expected)

        # overwrite
        expected = df.copy()
        expected["A"] = [5, 7, 9]
        result = df.assign(A=df.A + df.B)
        tm.assert_frame_equal(result, expected)

        # lambda
        result = df.assign(A=lambda x: x.A + x.B)
        tm.assert_frame_equal(result, expected)

    def test_assign_multiple(self):
        """Several columns of different kinds can be assigned in one call."""
        df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
        expected = DataFrame(
            [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
        )

        result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
        tm.assert_frame_equal(result, expected)

    def test_assign_order(self):
        """New columns are appended in keyword order."""
        # GH 9818
        df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        total = df.A + df.B
        delta = df.A - df.B

        result = df.assign(D=total, C=delta)
        expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
        tm.assert_frame_equal(result, expected)

        result = df.assign(C=delta, D=total)
        expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
        tm.assert_frame_equal(result, expected)

    def test_assign_bad(self):
        """Positional arguments and references to not-yet-created columns fail."""
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # non-keyword argument
        msg = r"assign\(\) takes 1 positional argument but 2 were given"
        with pytest.raises(TypeError, match=msg):
            df.assign(lambda x: x.A)

        # ``df.C`` is evaluated before ``assign`` runs, so it does not exist yet
        msg = "'DataFrame' object has no attribute 'C'"
        with pytest.raises(AttributeError, match=msg):
            df.assign(C=df.A, D=df.A + df.C)

    def test_assign_dependent(self):
        """Callables may refer to columns created earlier in the same call."""
        df = DataFrame({"A": [1, 2], "B": [3, 4]})
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))

        result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
        tm.assert_frame_equal(result, expected)

        result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,911 @@
import re
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
DatetimeTZDtype,
Index,
Interval,
IntervalDtype,
NaT,
Series,
Timedelta,
Timestamp,
concat,
date_range,
option_context,
)
import pandas._testing as tm
def _check_cast(df, v):
    """
    Assert that every column of ``df`` has a dtype whose name equals ``v``.
    """
    dtype_names = [column.dtype.name for _label, column in df.items()]
    assert all(name == v for name in dtype_names)
class TestAstype:
def test_astype_float(self, float_frame):
casted = float_frame.astype(int)
expected = DataFrame(
float_frame.values.astype(int),
index=float_frame.index,
columns=float_frame.columns,
)
tm.assert_frame_equal(casted, expected)
casted = float_frame.astype(np.int32)
expected = DataFrame(
float_frame.values.astype(np.int32),
index=float_frame.index,
columns=float_frame.columns,
)
tm.assert_frame_equal(casted, expected)
float_frame["foo"] = "5"
casted = float_frame.astype(int)
expected = DataFrame(
float_frame.values.astype(int),
index=float_frame.index,
columns=float_frame.columns,
)
tm.assert_frame_equal(casted, expected)
def test_astype_mixed_float(self, mixed_float_frame):
    """Cast the A/B columns of a mixed-precision frame to float32 and float16."""
    # mixed casting
    for target in ("float32", "float16"):
        subset = mixed_float_frame.reindex(columns=["A", "B"])
        _check_cast(subset.astype(target), target)
def test_astype_mixed_type(self):
    """Cast the numeric part of a mixed-dtype frame to several target dtypes."""
    # mixed casting
    frame = DataFrame(
        {
            "a": 1.0,
            "b": 2,
            "c": "foo",
            "float32": np.array([1.0] * 10, dtype="float32"),
            "int32": np.array([1] * 10, dtype="int32"),
        },
        index=np.arange(10),
    )
    numeric = frame._get_numeric_data().copy()
    numeric["little_float"] = np.array(12345.0, dtype="float16")
    numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

    for target in ("float64", "int64"):
        _check_cast(numeric.astype(target), target)

    # float16 is only checked on the column that was created as float16
    only_little = numeric.reindex(columns=["little_float"])
    _check_cast(only_little.astype("float16"), "float16")

    for target in ("float32", "int32"):
        _check_cast(numeric.astype(target), target)

    # to object
    _check_cast(numeric.astype("O"), "object")
def test_astype_with_exclude_string(self, float_frame):
df = float_frame.copy()
expected = float_frame.astype(int)
df["string"] = "foo"
casted = df.astype(int, errors="ignore")
expected["string"] = "foo"
tm.assert_frame_equal(casted, expected)
df = float_frame.copy()
expected = float_frame.astype(np.int32)
df["string"] = "foo"
casted = df.astype(np.int32, errors="ignore")
expected["string"] = "foo"
tm.assert_frame_equal(casted, expected)
def test_astype_with_view_float(self, float_frame):
# this is the only real reason to do it this way
tf = np.round(float_frame).astype(np.int32)
tf.astype(np.float32, copy=False)
# TODO(wesm): verification?
tf = float_frame.astype(np.float64)
tf.astype(np.int64, copy=False)
def test_astype_with_view_mixed_float(self, mixed_float_frame):
tf = mixed_float_frame.reindex(columns=["A", "B", "C"])
tf.astype(np.int64)
tf.astype(np.float32)
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
@pytest.mark.parametrize("val", [np.nan, np.inf])
def test_astype_cast_nan_inf_int(self, val, dtype):
    """NaN and inf cannot be cast to an integer dtype (GH#14265)."""
    # see GH#14265
    #
    # Check NaN and inf --> raise error when converting to int.
    frame = DataFrame([val])
    expected_msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
    with pytest.raises(ValueError, match=expected_msg):
        frame.astype(dtype)
def test_astype_str(self):
# see GH#9757
a = Series(date_range("2010-01-04", periods=5))
b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
c = Series([Timedelta(x, unit="d") for x in range(5)])
d = Series(range(5))
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
# Datetime-like
result = df.astype(str)
expected = DataFrame(
{
"a": list(map(str, (Timestamp(x)._date_repr for x in a._values))),
"b": list(map(str, map(Timestamp, b._values))),
"c": [Timedelta(x)._repr_base() for x in c._values],
"d": list(map(str, d._values)),
"e": list(map(str, e._values)),
},
dtype="object",
)
tm.assert_frame_equal(result, expected)
def test_astype_str_float(self):
# see GH#11302
result = DataFrame([np.nan]).astype(str)
expected = DataFrame(["nan"], dtype="object")
tm.assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(str)
val = "1.1234567890123457"
expected = DataFrame([val], dtype="object")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype_class", [dict, Series])
def test_astype_dict_like(self, dtype_class):
    """``astype`` with a column->dtype mapping given as a dict or a Series."""
    # GH7271 & GH16717
    dates = Series(date_range("2010-01-04", periods=5))
    ints = Series(range(5))
    floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])
    numeric_text = Series(["1.0", "2", "3.14", "4", "5.4"])
    df = DataFrame({"a": dates, "b": ints, "c": floats, "d": numeric_text})
    original = df.copy(deep=True)

    # change type of a subset of columns
    mapping = dtype_class({"b": "str", "d": "float32"})
    want = DataFrame(
        {
            "a": dates,
            "b": Series(["0", "1", "2", "3", "4"], dtype="object"),
            "c": floats,
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        }
    )
    tm.assert_frame_equal(df.astype(mapping), want)
    tm.assert_frame_equal(df, original)

    mapping = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
    want = DataFrame(
        {
            "a": dates,
            "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        }
    )
    tm.assert_frame_equal(df.astype(mapping), want)
    tm.assert_frame_equal(df, original)

    # change all columns
    all_str = dtype_class({"a": str, "b": str, "c": str, "d": str})
    tm.assert_frame_equal(df.astype(all_str), df.astype(str))
    tm.assert_frame_equal(df, original)

    # error should be raised when using something other than column labels
    # in the keys of the dtype dict
    msg_frame = (
        "Only a column name can be used for the key in a dtype mappings argument. "
        "'{}' not found in columns."
    )
    bad_mappings = [
        (dtype_class({"b": str, 2: str}), 2),
        (dtype_class({"e": str}), "e"),
    ]
    for bad_mapping, missing_key in bad_mappings:
        with pytest.raises(KeyError, match=msg_frame.format(missing_key)):
            df.astype(bad_mapping)
    tm.assert_frame_equal(df, original)

    # if the dtypes provided are the same as the original dtypes, the
    # resulting DataFrame should be the same as the original DataFrame
    same_dtypes = dtype_class({col: df[col].dtype for col in df.columns})
    tm.assert_frame_equal(df, df.astype(same_dtypes))
    tm.assert_frame_equal(df, original)

    # GH#16717
    # if dtypes provided is empty, the resulting DataFrame
    # should be the same as the original DataFrame
    if dtype_class is dict:
        empty_mapping = dtype_class({})
    else:
        empty_mapping = dtype_class({}, dtype=object)
    tm.assert_frame_equal(df, df.astype(empty_mapping))
    tm.assert_frame_equal(df, original)
def test_astype_duplicate_col(self):
a1 = Series([1, 2, 3, 4, 5], name="a")
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
a2 = Series([0, 1, 2, 3, 4], name="a")
df = concat([a1, b, a2], axis=1)
result = df.astype(str)
a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
expected = concat([a1_str, b_str, a2_str], axis=1)
tm.assert_frame_equal(result, expected)
result = df.astype({"a": "str"})
expected = concat([a1_str, b, a2_str], axis=1)
tm.assert_frame_equal(result, expected)
def test_astype_duplicate_col_series_arg(self):
# GH#44417
vals = np.random.default_rng(2).standard_normal((3, 4))
df = DataFrame(vals, columns=["A", "B", "C", "A"])
dtypes = df.dtypes
dtypes.iloc[0] = str
dtypes.iloc[2] = "Float64"
result = df.astype(dtypes)
expected = DataFrame(
{
0: Series(vals[:, 0].astype(str), dtype=object),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
}
)
expected.columns = df.columns
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        CategoricalDtype(ordered=True),
        CategoricalDtype(ordered=False),
        CategoricalDtype(categories=list("abcdef")),
        CategoricalDtype(categories=list("edba"), ordered=False),
        CategoricalDtype(categories=list("edcb"), ordered=True),
    ],
    ids=repr,
)
def test_astype_categorical(self, dtype):
    """Frame-wide categorical cast equals building each column as Categorical."""
    # GH#18099
    raw = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
    got = DataFrame(raw).astype(dtype)
    want = DataFrame(
        {label: Categorical(values, dtype=dtype) for label, values in raw.items()}
    )
    tm.assert_frame_equal(got, want)
@pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
def test_astype_categoricaldtype_class_raises(self, cls):
    """Passing a dtype *class* instead of an instance raises TypeError."""
    df = DataFrame({"A": ["a", "a", "b", "c"]})
    xpr = f"Expected an instance of {cls.__name__}"
    casts = (
        lambda: df.astype({"A": cls}),
        lambda: df["A"].astype(cls),
    )
    for cast in casts:
        with pytest.raises(TypeError, match=xpr):
            cast()
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
def test_astype_extension_dtypes(self, dtype):
    """Float frame to masked-integer dtypes, fully and column-by-column."""
    # GH#22578
    rows = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
    both_masked = DataFrame(
        {
            "a": pd.array([1, 3, 5], dtype=dtype),
            "b": pd.array([2, 4, 6], dtype=dtype),
        }
    )

    df = DataFrame(rows, columns=["a", "b"])
    tm.assert_frame_equal(df.astype(dtype), both_masked)
    tm.assert_frame_equal(df.astype("int64").astype(dtype), both_masked)
    tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

    # same checks starting from a frame where only "b" is already masked
    df = DataFrame(rows, columns=["a", "b"])
    df["b"] = df["b"].astype(dtype)
    only_b_masked = DataFrame(
        {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
    )
    tm.assert_frame_equal(df, only_b_masked)
    tm.assert_frame_equal(df.astype(dtype), both_masked)
    tm.assert_frame_equal(df.astype("int64").astype(dtype), both_masked)
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
def test_astype_extension_dtypes_1d(self, dtype):
    """Single-column variant of the masked-integer cast checks."""
    # GH#22578
    masked = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})

    df = DataFrame({"a": [1.0, 2.0, 3.0]})
    tm.assert_frame_equal(df.astype(dtype), masked)
    tm.assert_frame_equal(df.astype("int64").astype(dtype), masked)

    # cast the column in place first, then repeat the frame-level casts
    df = DataFrame({"a": [1.0, 2.0, 3.0]})
    df["a"] = df["a"].astype(dtype)
    tm.assert_frame_equal(df, DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}))
    tm.assert_frame_equal(df.astype(dtype), masked)
    tm.assert_frame_equal(df.astype("int64").astype(dtype), masked)
@pytest.mark.parametrize("dtype", ["category", "Int64"])
def test_astype_extension_dtypes_duplicate_col(self, dtype):
    """Extension-dtype cast on a frame with duplicated column labels."""
    # GH#24704
    first = Series([0, np.nan, 4], name="a")
    second = Series([np.nan, 3, 5], name="a")
    got = concat([first, second], axis=1).astype(dtype)
    want = concat([first.astype(dtype), second.astype(dtype)], axis=1)
    tm.assert_frame_equal(got, want)
@pytest.mark.parametrize(
    "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
)
def test_astype_column_metadata(self, dtype):
    """``astype`` keeps the columns Index (dtype and name) unchanged."""
    # GH#19920
    labels = Index([100, 200, 300], dtype=np.uint64, name="foo")
    frame = DataFrame(np.arange(15).reshape(5, 3), columns=labels)
    converted = frame.astype(dtype)
    tm.assert_index_equal(converted.columns, labels)
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
def test_astype_from_object_to_datetime_unit(self, unit):
    """Object-to-datetime64 casts with these units raise ValueError."""
    date_strings = [
        ["2015-01-01", "2015-01-02", "2015-01-03"],
        ["2017-01-01", "2017-01-02", "2017-02-03"],
    ]
    frame = DataFrame(date_strings, dtype=object)
    msg = (
        rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
        r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
        r"'datetime64\[ns\]' or DatetimeTZDtype"
    )
    target = f"M8[{unit}]"
    with pytest.raises(ValueError, match=msg):
        frame.astype(target)
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
def test_astype_from_object_to_timedelta_unit(self, unit):
    """Object-to-timedelta64 casts with these units raise ValueError."""
    delta_strings = [
        ["1 Day", "2 Days", "3 Days"],
        ["4 Days", "5 Days", "6 Days"],
    ]
    frame = DataFrame(delta_strings, dtype=object)
    msg = (
        r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. "
        "Supported resolutions are 's', 'ms', 'us', 'ns'"
    )
    target = f"m8[{unit}]"
    with pytest.raises(ValueError, match=msg):
        # TODO: this is ValueError while for DatetimeArray it is TypeError;
        # get these consistent
        frame.astype(target)
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_from_datetimelike_to_object(self, dtype, unit):
    """datetime64/timedelta64 frames cast to object hold Timestamp/Timedelta."""
    # tests astype to object dtype
    # GH#19223 / GH#12425
    full_dtype = f"{dtype}[{unit}]"
    frame = DataFrame(np.array([[1, 2, 3]], dtype=full_dtype))

    as_object = frame.astype(object)
    assert (as_object.dtypes == object).all()

    # the first cell was built from the integer 1 in the given unit
    scalar_cls = Timestamp if full_dtype.startswith("M8") else Timedelta
    assert as_object.iloc[0, 0] == scalar_cls(1, unit=unit)
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
    """Numeric frames cast to datetime-like dtypes match the ndarray cast."""
    # tests all units from numeric origination
    # GH#19223 / GH#12425
    target = f"{dtype}[{unit}]"
    values = np.array([[1, 2, 3]], dtype=arr_dtype)
    got = DataFrame(values).astype(target)
    want = DataFrame(values.astype(target))
    tm.assert_frame_equal(got, want)
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_datetime_unit(self, unit):
    """datetime64 data cast to ``M8[unit]`` via frame, series, index and array."""
    # tests all units from datetime origination
    # GH#19223
    dtype = f"M8[{unit}]"
    values = np.array([[1, 2, 3]], dtype=dtype)
    df = DataFrame(values)
    ser = df.iloc[:, 0]
    idx = Index(ser)
    dta = ser._values

    if unit not in ["ns", "us", "ms", "s"]:
        # we use the nearest supported dtype (i.e. M8[s])
        msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
        failing = [
            (df, msg),
            (ser, msg),
            (idx, msg.replace("Array", "Index")),
            (dta, msg),
        ]
        for obj, pattern in failing:
            with pytest.raises(TypeError, match=pattern):
                obj.astype(dtype)
        return

    # GH#48928
    result = df.astype(dtype)

    exp_df = DataFrame(values.astype(dtype))
    assert (exp_df.dtypes == dtype).all()
    tm.assert_frame_equal(result, exp_df)

    exp_ser = exp_df.iloc[:, 0]
    res_ser = ser.astype(dtype)
    assert exp_ser.dtype == dtype
    tm.assert_series_equal(res_ser, exp_ser)

    exp_index = Index(exp_ser)
    res_index = idx.astype(dtype)
    assert exp_index.dtype == dtype
    tm.assert_index_equal(res_index, exp_index)

    exp_dta = exp_ser._values
    res_dta = dta.astype(dtype)
    assert exp_dta.dtype == dtype
    tm.assert_extension_array_equal(res_dta, exp_dta)
@pytest.mark.parametrize("unit", ["ns"])
def test_astype_to_timedelta_unit_ns(self, unit):
    """Casting timedelta64[ns] data to timedelta64[ns] keeps the values."""
    # preserve the timedelta conversion
    # GH#19223
    target = f"m8[{unit}]"
    values = np.array([[1, 2, 3]], dtype=target)
    got = DataFrame(values).astype(target)
    want = DataFrame(values.astype(target))
    tm.assert_frame_equal(got, want)
@pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
def test_astype_to_timedelta_unit(self, unit):
    """
    timedelta64 data cast to ``m8[unit]`` via frame, series, index and array.

    For "us", "ms" and "s" the frame already has the requested dtype and the
    cast must return an equal frame. For the other units the frame is
    expected to hold ``m8[s]`` and the cast must raise ValueError.
    """
    # GH#19223 until 2.0 used to coerce to float
    dtype = f"m8[{unit}]"
    arr = np.array([[1, 2, 3]], dtype=dtype)
    df = DataFrame(arr)
    ser = df.iloc[:, 0]
    tdi = Index(ser)
    tda = tdi._values

    if unit not in ["us", "ms", "s"]:
        # We get the nearest supported unit, i.e. "s"
        assert (df.dtypes == "m8[s]").all()
        msg = (
            rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
            "Supported resolutions are 's', 'ms', 'us', 'ns'"
        )
        for obj in (df, ser, tdi, tda):
            with pytest.raises(ValueError, match=msg):
                obj.astype(dtype)
        return

    assert (df.dtypes == dtype).all()
    # The conversion is a no-op, so we just get a copy; cast once and compare
    # against the source frame.
    result = df.astype(dtype)
    tm.assert_frame_equal(result, df)
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_incorrect_datetimelike(self, unit):
    """Casting datetime64 to timedelta64 (or the reverse) raises TypeError."""
    # trying to astype a m to a M, or vice-versa
    # GH#19224
    dt_dtype = f"M8[{unit}]"
    td_dtype = f"m8[{unit}]"

    # datetime64 -> timedelta64
    dt_frame = DataFrame(np.array([[1, 2, 3]], dtype=dt_dtype))
    dt_to_td_patterns = [
        # BlockManager path
        rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]",
        # ArrayManager path
        "cannot astype a datetimelike from "
        rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]",
    ]
    with pytest.raises(TypeError, match="|".join(dt_to_td_patterns)):
        dt_frame.astype(td_dtype)

    # timedelta64 -> datetime64
    td_to_dt_patterns = [
        # BlockManager path
        rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]",
        # ArrayManager path
        "cannot astype a timedelta from "
        rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]",
    ]
    td_frame = DataFrame(np.array([[1, 2, 3]], dtype=td_dtype))
    with pytest.raises(TypeError, match="|".join(td_to_dt_patterns)):
        td_frame.astype(dt_dtype)
def test_astype_arg_for_errors(self):
    """``errors`` must be 'raise' or 'ignore'; other values raise ValueError."""
    # GH#14878
    frame = DataFrame([1, 2, 3])
    msg = (
        "Expected value of kwarg 'errors' to be one of "
        "['raise', 'ignore']. Supplied value is 'True'"
    )
    # the message contains regex metacharacters, so match it literally
    literal_pattern = re.escape(msg)
    with pytest.raises(ValueError, match=literal_pattern):
        frame.astype(np.float64, errors=True)

    # a valid value is accepted
    frame.astype(np.int8, errors="ignore")
def test_astype_invalid_conversion(self):
    """A failed cast reports the offending column in the message (GH#47571)."""
    # GH#47571
    frame = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})
    msg = (
        "invalid literal for int() with base 10: 'text': "
        "Error while type casting for column 'a'"
    )
    literal_pattern = re.escape(msg)
    with pytest.raises(ValueError, match=literal_pattern):
        frame.astype({"a": int})
def test_astype_arg_for_errors_dictlist(self):
# GH#25905
df = DataFrame(
[
{"a": "1", "b": "16.5%", "c": "test"},
{"a": "2.2", "b": "15.3", "c": "another_test"},
]
)
expected = DataFrame(
[
{"a": 1.0, "b": "16.5%", "c": "test"},
{"a": 2.2, "b": "15.3", "c": "another_test"},
]
)
expected["c"] = expected["c"].astype("object")
type_dict = {"a": "float64", "b": "float64", "c": "object"}
result = df.astype(dtype=type_dict, errors="ignore")
tm.assert_frame_equal(result, expected)
def test_astype_dt64tz(self, timezone_frame):
    """tz-aware frame to object works; tz-aware to naive datetime64 raises."""
    # astype
    naive_col = [
        Timestamp("2013-01-01 00:00:00"),
        Timestamp("2013-01-02 00:00:00"),
        Timestamp("2013-01-03 00:00:00"),
    ]
    eastern_col = [
        Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
        NaT,
        Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
    ]
    cet_col = [
        Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
        NaT,
        Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
    ]
    # the lists above are columns; transpose so each becomes a frame column
    cells = np.array([naive_col, eastern_col, cet_col], dtype=object).T
    want = DataFrame(
        cells,
        index=timezone_frame.index,
        columns=timezone_frame.columns,
        dtype=object,
    )
    tm.assert_frame_equal(timezone_frame.astype(object), want)

    msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
    with pytest.raises(TypeError, match=msg):
        # dt64tz->dt64 deprecated
        timezone_frame.astype("datetime64[ns]")
def test_astype_dt64tz_to_str(self, timezone_frame):
# str formatting
result = timezone_frame.astype(str)
expected = DataFrame(
[
[
"2013-01-01",
"2013-01-01 00:00:00-05:00",
"2013-01-01 00:00:00+01:00",
],
["2013-01-02", "NaT", "NaT"],
[
"2013-01-03",
"2013-01-03 00:00:00-05:00",
"2013-01-03 00:00:00+01:00",
],
],
columns=timezone_frame.columns,
dtype="object",
)
tm.assert_frame_equal(result, expected)
with option_context("display.max_columns", 20):
result = str(timezone_frame)
assert (
"0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
) in result
assert (
"1 2013-01-02 NaT NaT"
) in result
assert (
"2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
) in result
def test_astype_empty_dtype_dict(self):
# issue mentioned further down in the following issue's thread
# https://github.com/pandas-dev/pandas/issues/33113
df = DataFrame()
result = df.astype({})
tm.assert_frame_equal(result, df)
assert result is not df
@pytest.mark.parametrize(
    "data, dtype",
    [
        (["x", "y", "z"], "string[python]"),
        pytest.param(
            ["x", "y", "z"],
            "string[pyarrow]",
            marks=td.skip_if_no("pyarrow"),
        ),
        (["x", "y", "z"], "category"),
        (3 * [Timestamp("2020-01-01", tz="UTC")], None),
        (3 * [Interval(0, 1)], None),
    ],
)
@pytest.mark.parametrize("errors", ["raise", "ignore"])
def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
    """``errors`` is honoured when an extension-dtype column can't become float."""
    # https://github.com/pandas-dev/pandas/issues/35471
    df = DataFrame(Series(data, dtype=dtype))

    if errors != "ignore":
        msg = "(Cannot cast)|(could not convert)"
        with pytest.raises((ValueError, TypeError), match=msg):
            df.astype(float, errors=errors)
        return

    # with "ignore" the frame comes back unchanged
    result = df.astype(float, errors=errors)
    tm.assert_frame_equal(result, df)
def test_astype_tz_conversion(self):
# GH 35973
val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
df = DataFrame(val)
result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})
expected = df
expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
def test_astype_tz_object_conversion(self, tz):
    """Object column of tz-aware stamps cast back to the construction tz."""
    # GH 35973
    val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
    expected = DataFrame(val)

    # convert expected to object dtype from other tz str (independently tested)
    in_other_tz = expected.astype({"tz": f"datetime64[ns, {tz}]"})
    as_object = in_other_tz.astype({"tz": "object"})

    # do real test: object dtype to a specified tz, different from construction tz.
    result = as_object.astype({"tz": "datetime64[ns, Europe/London]"})
    tm.assert_frame_equal(result, expected)
def test_astype_dt64_to_string(
self, frame_or_series, tz_naive_fixture, using_infer_string
):
# GH#41409
tz = tz_naive_fixture
dti = date_range("2016-01-01", periods=3, tz=tz)
dta = dti._data
dta[0] = NaT
obj = frame_or_series(dta)
result = obj.astype("string")
# Check that Series/DataFrame.astype matches DatetimeArray.astype
expected = frame_or_series(dta.astype("string"))
tm.assert_equal(result, expected)
item = result.iloc[0]
if frame_or_series is DataFrame:
item = item.iloc[0]
if using_infer_string:
assert item is np.nan
else:
assert item is pd.NA
# For non-NA values, we should match what we get for non-EA str
alt = obj.astype(str)
assert np.all(alt.iloc[1:] == result.iloc[1:])
def test_astype_td64_to_string(self, frame_or_series):
# GH#41409
tdi = pd.timedelta_range("1 Day", periods=3)
obj = frame_or_series(tdi)
expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
result = obj.astype("string")
tm.assert_equal(result, expected)
def test_astype_bytes(self):
# GH#39474
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
assert result.dtypes[0] == np.dtype("S3")
@pytest.mark.parametrize(
    "index_slice",
    [
        np.s_[:2, :2],
        np.s_[:1, :2],
        np.s_[:2, :1],
        np.s_[::2, ::2],
        np.s_[::1, ::2],
        np.s_[::2, ::1],
    ],
)
def test_astype_noncontiguous(self, index_slice):
    """Casting a sliced frame keeps the sliced values (GH#42396)."""
    # GH#42396
    frame = DataFrame(np.arange(16).reshape(4, 4))
    sliced = frame.iloc[index_slice]
    # only values are compared; the dtype differs by construction
    tm.assert_frame_equal(sliced.astype("int16"), sliced, check_dtype=False)
def test_astype_retain_attrs(self, any_numpy_dtype):
# GH#44414
df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
df.attrs["Location"] = "Michigan"
result = df.astype({"a": any_numpy_dtype}).attrs
expected = df.attrs
tm.assert_dict_equal(expected, result)
class TestAstypeCategorical:
def test_astype_from_categorical3(self):
df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
cats = Categorical([1, 2, 3, 4, 5, 6])
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
def test_astype_from_categorical4(self):
df = DataFrame(
{"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
)
cats = Categorical(["a", "b", "b", "a", "a", "d"])
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
def test_categorical_astype_to_int(self, any_int_dtype):
# GH#39402
df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
df.col1 = df.col1.astype("category")
df.col1 = df.col1.astype(any_int_dtype)
expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
tm.assert_frame_equal(df, expected)
def test_astype_categorical_to_string_missing(self):
# https://github.com/pandas-dev/pandas/issues/41797
df = DataFrame(["a", "b", np.nan])
expected = df.astype(str)
cat = df.astype("category")
result = cat.astype(str)
tm.assert_frame_equal(result, expected)
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """``IntegerArray`` whose ``copy`` always fails, to expose unwanted copies."""

    # GH 42501
    def copy(self):
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed under ``python -O``, which would turn this guard into a
        # silent ``return None``.
        raise AssertionError("IntegerArrayNoCopy.copy() must not be called")
class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` variant whose arrays are ``IntegerArrayNoCopy`` instances."""

    # GH 42501
    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        array_type = IntegerArrayNoCopy
        return array_type
def test_frame_astype_no_copy():
    """``astype(copy=False)`` with a mapping leaves untouched columns shared."""
    # GH 42501
    source = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)
    converted = source.astype({"a": Int16DtypeNoCopy()}, copy=False)

    # "a" was converted; the custom dtype compares equal to plain Int16
    assert converted.a.dtype == pd.Int16Dtype()
    # "b" was not in the mapping and must still point at the same memory
    assert np.shares_memory(source.b.values, converted.b.values)
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    """``astype(copy=True)`` to a pyarrow dtype is independent of its source."""
    # GH#50984
    pytest.importorskip("pyarrow")
    source = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    converted = source.astype("int64[pyarrow]", copy=True)

    # writing to the source afterwards must not show up in the result
    source.iloc[0, 0] = 100
    want = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(converted, want)
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_not_modifying_input(string_storage, val):
    """``astype("string", copy=False)`` leaves the source frame unchanged."""
    # GH#51073
    frame = DataFrame({"a": ["a", "b", val]})
    before = frame.copy()
    with option_context("mode.string_storage", string_storage):
        frame.astype("string", copy=False)
    tm.assert_frame_equal(frame, before)

View File

@ -0,0 +1,132 @@
from datetime import time
import numpy as np
import pytest
import pytz
from pandas._libs.tslibs import timezones
from pandas import (
DataFrame,
date_range,
)
import pandas._testing as tm
class TestAtTime:
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_localized_at_time(self, tzstr, frame_or_series):
    """``at_time`` after ``tz_localize`` equals ``tz_localize`` after ``at_time``."""
    tz = timezones.maybe_get_tz(tzstr)
    rng = date_range("4/16/2012", "5/1/2012", freq="h")
    naive = frame_or_series(
        np.random.default_rng(2).standard_normal(len(rng)), index=rng
    )
    ten_am = time(10, 0)

    result = naive.tz_localize(tzstr).at_time(ten_am)
    expected = naive.at_time(time(10, 0)).tz_localize(tzstr)

    tm.assert_equal(result, expected)
    assert timezones.tz_compare(result.index.tz, tz)
def test_at_time(self, frame_or_series):
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
rs = ts.at_time(rng[1])
assert (rs.index.hour == rng[1].hour).all()
assert (rs.index.minute == rng[1].minute).all()
assert (rs.index.second == rng[1].second).all()
result = ts.at_time("9:30")
expected = ts.at_time(time(9, 30))
tm.assert_equal(result, expected)
def test_at_time_midnight(self, frame_or_series):
# midnight, everything
rng = date_range("1/1/2000", "1/31/2000")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
result = ts.at_time(time(0, 0))
tm.assert_equal(result, ts)
def test_at_time_nonexistent(self, frame_or_series):
# time doesn't exist
rng = date_range("1/1/2012", freq="23Min", periods=384)
ts = DataFrame(np.random.default_rng(2).standard_normal(len(rng)), rng)
ts = tm.get_obj(ts, frame_or_series)
rs = ts.at_time("16:00")
assert len(rs) == 0
@pytest.mark.parametrize(
    "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
)
def test_at_time_errors(self, hour):
    """A tz-aware time against a tz-naive index raises; naive inputs select."""
    # GH#24043
    dti = date_range("2018", periods=3, freq="h")
    df = DataFrame(list(range(len(dti))), index=dti)

    # strings have no ``tzinfo`` attribute, hence getattr with a default
    if getattr(hour, "tzinfo", None) is not None:
        with pytest.raises(ValueError, match="Index must be timezone"):
            df.at_time(hour)
        return

    tm.assert_frame_equal(df.at_time(hour), df.iloc[1:2])
def test_at_time_tz(self):
    """A tz-aware time in another zone selects from a tz-aware index (GH#24043)."""
    # GH#24043
    dti = date_range("2018", periods=3, freq="h", tz="US/Pacific")
    df = DataFrame(list(range(len(dti))), index=dti)

    four_am_eastern = time(4, tzinfo=pytz.timezone("US/Eastern"))
    result = df.at_time(four_am_eastern)
    # the second row of the US/Pacific index is the expected hit
    tm.assert_frame_equal(result, df.iloc[1:2])
def test_at_time_raises(self, frame_or_series):
    """``at_time`` on an object without a DatetimeIndex raises TypeError."""
    # GH#20725
    plain = DataFrame([[1, 2, 3], [4, 5, 6]])
    plain = tm.get_obj(plain, frame_or_series)
    expected_msg = "Index must be DatetimeIndex"
    with pytest.raises(TypeError, match=expected_msg):  # index is not a DatetimeIndex
        plain.at_time("00:00")
@pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
def test_at_time_axis(self, axis):
    """``at_time`` filters along the requested axis, by name or number."""
    # issue 8839
    rng = date_range("1/1/2000", "1/5/2000", freq="5min")
    ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
    ts.index, ts.columns = rng, rng

    # labels whose clock time is exactly 09:30:00
    at_0930 = (rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)
    indices = rng[at_0930]

    if axis in ["index", 0]:
        expected = ts.loc[indices, :]
    elif axis in ["columns", 1]:
        expected = ts.loc[:, indices]

    result = ts.at_time("9:30", axis=axis)

    # Without clearing freq, result has freq 1440T and expected 5T
    result.index = result.index._with_freq(None)
    expected.index = expected.index._with_freq(None)
    tm.assert_frame_equal(result, expected)
def test_at_time_datetimeindex(self):
index = date_range("2012-01-01", "2012-01-05", freq="30min")
df = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
)
akey = time(12, 0, 0)
ainds = [24, 72, 120, 168]
result = df.at_time(akey)
expected = df.loc[akey]
expected2 = df.iloc[ainds]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected2)
assert len(result) == 4

View File

@ -0,0 +1,227 @@
from datetime import (
datetime,
time,
)
import numpy as np
import pytest
from pandas._libs.tslibs import timezones
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
class TestBetweenTime:
@td.skip_if_not_us_locale
def test_between_time_formats(self, frame_or_series):
# GH#11818
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
strings = [
("2:00", "2:30"),
("0200", "0230"),
("2:00am", "2:30am"),
("0200am", "0230am"),
("2:00:00", "2:30:00"),
("020000", "023000"),
("2:00:00am", "2:30:00am"),
("020000am", "023000am"),
]
expected_length = 28
for time_string in strings:
assert len(ts.between_time(*time_string)) == expected_length
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_localized_between_time(self, tzstr, frame_or_series):
    """``between_time`` commutes with ``tz_localize`` on an hourly index."""
    tz = timezones.maybe_get_tz(tzstr)
    rng = date_range("4/16/2012", "5/1/2012", freq="h")
    naive = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
    if frame_or_series is DataFrame:
        naive = naive.to_frame()

    start, end = time(10, 0), time(11, 0)
    result = naive.tz_localize(tzstr).between_time(start, end)
    expected = naive.between_time(start, end).tz_localize(tzstr)

    tm.assert_equal(result, expected)
    assert timezones.tz_compare(result.index.tz, tz)
def test_between_time_types(self, frame_or_series):
    """Passing ``datetime`` objects as the bounds raises ValueError (GH11818)."""
    # GH11818
    rng = date_range("1/1/2000", "1/5/2000", freq="5min")
    obj = tm.get_obj(DataFrame({"A": 0}, index=rng), frame_or_series)

    lower = datetime(2010, 1, 2, 1)
    upper = datetime(2010, 1, 2, 5)
    msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time"
    with pytest.raises(ValueError, match=msg):
        obj.between_time(lower, upper)
def test_between_time(self, inclusive_endpoints_fixture, frame_or_series):
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
stime = time(0, 0)
etime = time(1, 0)
inclusive = inclusive_endpoints_fixture
filtered = ts.between_time(stime, etime, inclusive=inclusive)
exp_len = 13 * 4 + 1
if inclusive in ["right", "neither"]:
exp_len -= 5
if inclusive in ["left", "neither"]:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inclusive in ["left", "both"]:
assert t >= stime
else:
assert t > stime
if inclusive in ["right", "both"]:
assert t <= etime
else:
assert t < etime
result = ts.between_time("00:00", "01:00")
expected = ts.between_time(stime, etime)
tm.assert_equal(result, expected)
# across midnight
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
ts = tm.get_obj(ts, frame_or_series)
stime = time(22, 0)
etime = time(9, 0)
filtered = ts.between_time(stime, etime, inclusive=inclusive)
exp_len = (12 * 11 + 1) * 4 + 1
if inclusive in ["right", "neither"]:
exp_len -= 4
if inclusive in ["left", "neither"]:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inclusive in ["left", "both"]:
assert (t >= stime) or (t <= etime)
else:
assert (t > stime) or (t <= etime)
if inclusive in ["right", "both"]:
assert (t <= etime) or (t >= stime)
else:
assert (t < etime) or (t >= stime)
def test_between_time_raises(self, frame_or_series):
# GH#20725
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
obj = tm.get_obj(obj, frame_or_series)
msg = "Index must be DatetimeIndex"
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
obj.between_time(start_time="00:00", end_time="12:00")
def test_between_time_axis(self, frame_or_series):
# GH#8839
rng = date_range("1/1/2000", periods=100, freq="10min")
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
if frame_or_series is DataFrame:
ts = ts.to_frame()
stime, etime = ("08:00:00", "09:00:00")
expected_length = 7
assert len(ts.between_time(stime, etime)) == expected_length
assert len(ts.between_time(stime, etime, axis=0)) == expected_length
msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}"
with pytest.raises(ValueError, match=msg):
ts.between_time(stime, etime, axis=ts.ndim)
def test_between_time_axis_aliases(self, axis):
# GH#8839
rng = date_range("1/1/2000", periods=100, freq="10min")
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
stime, etime = ("08:00:00", "09:00:00")
exp_len = 7
if axis in ["index", 0]:
ts.index = rng
assert len(ts.between_time(stime, etime)) == exp_len
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
if axis in ["columns", 1]:
ts.columns = rng
selected = ts.between_time(stime, etime, axis=1).columns
assert len(selected) == exp_len
def test_between_time_axis_raises(self, axis):
# issue 8839
rng = date_range("1/1/2000", periods=100, freq="10min")
mask = np.arange(0, len(rng))
rand_data = np.random.default_rng(2).standard_normal((len(rng), len(rng)))
ts = DataFrame(rand_data, index=rng, columns=rng)
stime, etime = ("08:00:00", "09:00:00")
msg = "Index must be DatetimeIndex"
if axis in ["columns", 1]:
ts.index = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime)
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=0)
if axis in ["index", 0]:
ts.columns = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=1)
def test_between_time_datetimeindex(self):
index = date_range("2012-01-01", "2012-01-05", freq="30min")
df = DataFrame(
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
)
bkey = slice(time(13, 0, 0), time(14, 0, 0))
binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
result = df.between_time(bkey.start, bkey.stop)
expected = df.loc[bkey]
expected2 = df.iloc[binds]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected2)
assert len(result) == 12
def test_between_time_incorrect_arg_inclusive(self):
# GH40245
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
)
stime = time(0, 0)
etime = time(1, 0)
inclusive = "bad_string"
msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
with pytest.raises(ValueError, match=msg):
ts.between_time(stime, etime, inclusive=inclusive)

View File

@ -0,0 +1,199 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameClip:
def test_clip(self, float_frame):
median = float_frame.median().median()
original = float_frame.copy()
double = float_frame.clip(upper=median, lower=median)
assert not (double.values != median).any()
# Verify that float_frame was not changed inplace
assert (float_frame.values == original.values).all()
def test_inplace_clip(self, float_frame):
# GH#15388
median = float_frame.median().median()
frame_copy = float_frame.copy()
return_value = frame_copy.clip(upper=median, lower=median, inplace=True)
assert return_value is None
assert not (frame_copy.values != median).any()
def test_dataframe_clip(self):
# GH#2747
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
for lb, ub in [(-1, 1), (1, -1)]:
clipped_df = df.clip(lb, ub)
lb, ub = min(lb, ub), max(ub, lb)
lb_mask = df.values <= lb
ub_mask = df.values >= ub
mask = ~lb_mask & ~ub_mask
assert (clipped_df.values[lb_mask] == lb).all()
assert (clipped_df.values[ub_mask] == ub).all()
assert (clipped_df.values[mask] == df.values[mask]).all()
def test_clip_mixed_numeric(self):
# clip on mixed integer or floats
# GH#24162, clipping now preserves numeric types per column
df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
result = df.clip(1, 2)
expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
expected = df.dtypes
result = df.clip(upper=3).dtypes
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("inplace", [True, False])
def test_clip_against_series(self, inplace):
# GH#6966
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
lb = Series(np.random.default_rng(2).standard_normal(1000))
ub = lb + 1
original = df.copy()
clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)
if inplace:
clipped_df = df
for i in range(2):
lb_mask = original.iloc[:, i] <= lb
ub_mask = original.iloc[:, i] >= ub
mask = ~lb_mask & ~ub_mask
result = clipped_df.loc[lb_mask, i]
tm.assert_series_equal(result, lb[lb_mask], check_names=False)
assert result.name == i
result = clipped_df.loc[ub_mask, i]
tm.assert_series_equal(result, ub[ub_mask], check_names=False)
assert result.name == i
tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
@pytest.mark.parametrize(
"axis,res",
[
(0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
(1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
],
)
def test_clip_against_list_like(self, inplace, lower, axis, res):
# GH#15390
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
original = DataFrame(
arr, columns=["one", "two", "three"], index=["a", "b", "c"]
)
result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)
expected = DataFrame(res, columns=original.columns, index=original.index)
if inplace:
result = original
tm.assert_frame_equal(result, expected, check_exact=True)
@pytest.mark.parametrize("axis", [0, 1, None])
def test_clip_against_frame(self, axis):
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
lb = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
ub = lb + 1
clipped_df = df.clip(lb, ub, axis=axis)
lb_mask = df <= lb
ub_mask = df >= ub
mask = ~lb_mask & ~ub_mask
tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
tm.assert_frame_equal(clipped_df[mask], df[mask])
def test_clip_against_unordered_columns(self):
# GH#20911
df1 = DataFrame(
np.random.default_rng(2).standard_normal((1000, 4)),
columns=["A", "B", "C", "D"],
)
df2 = DataFrame(
np.random.default_rng(2).standard_normal((1000, 4)),
columns=["D", "A", "B", "C"],
)
df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
result_upper = df1.clip(lower=0, upper=df2)
expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
result_lower = df1.clip(lower=df3, upper=3)
expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
result_lower_upper = df1.clip(lower=df3, upper=df2)
expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
tm.assert_frame_equal(result_upper, expected_upper)
tm.assert_frame_equal(result_lower, expected_lower)
tm.assert_frame_equal(result_lower_upper, expected_lower_upper)
def test_clip_with_na_args(self, float_frame):
"""Should process np.nan argument as None"""
# GH#17276
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)
# GH#19992 and adjusted in GH#40420
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
# TODO: avoid this warning here? seems like we should never be upcasting
# in the first place?
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.clip(lower=[4, 5, np.nan], axis=0)
expected = DataFrame(
{"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)
result = df.clip(lower=[4, 5, np.nan], axis=1)
expected = DataFrame(
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)
# GH#40420
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
df = DataFrame(data)
t = Series([2, -4, np.nan, 6, 3])
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.clip(lower=t, axis=0)
expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
tm.assert_frame_equal(result, expected)
def test_clip_int_data_with_float_bound(self):
# GH51472
df = DataFrame({"a": [1, 2, 3]})
result = df.clip(lower=1.5)
expected = DataFrame({"a": [1.5, 2.0, 3.0]})
tm.assert_frame_equal(result, expected)
def test_clip_with_list_bound(self):
# GH#54817
df = DataFrame([1, 5])
expected = DataFrame([3, 5])
result = df.clip([3])
tm.assert_frame_equal(result, expected)
expected = DataFrame([1, 3])
result = df.clip(upper=[3])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,47 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class TestCombine:
@pytest.mark.parametrize(
"data",
[
pd.date_range("2000", periods=4),
pd.date_range("2000", periods=4, tz="US/Central"),
pd.period_range("2000", periods=4),
pd.timedelta_range(0, periods=4),
],
)
def test_combine_datetlike_udf(self, data):
# GH#23079
df = pd.DataFrame({"A": data})
other = df.copy()
df.iloc[1, 0] = None
def combiner(a, b):
return b
result = df.combine(other, combiner)
tm.assert_frame_equal(result, other)
def test_combine_generic(self, float_frame):
df1 = float_frame
df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]
combined = df1.combine(df2, np.add)
combined2 = df2.combine(df1, np.add)
assert combined["D"].isna().all()
assert combined2["D"].isna().all()
chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]
exp = (
float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
* 2
)
tm.assert_frame_equal(chunk, exp)
tm.assert_frame_equal(chunk2, exp)

View File

@ -0,0 +1,556 @@
from datetime import datetime
import numpy as np
import pytest
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import is_dtype_equal
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
class TestDataFrameCombineFirst:
def test_combine_first_mixed(self):
a = Series(["a", "b"], index=range(2))
b = Series(range(2), index=range(2))
f = DataFrame({"A": a, "B": b})
a = Series(["a", "b"], index=range(5, 7))
b = Series(range(2), index=range(5, 7))
g = DataFrame({"A": a, "B": b})
exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)
def test_combine_first(self, float_frame, using_infer_string):
# disjoint
head, tail = float_frame[:5], float_frame[5:]
combined = head.combine_first(tail)
reordered_frame = float_frame.reindex(combined.index)
tm.assert_frame_equal(combined, reordered_frame)
tm.assert_index_equal(combined.columns, float_frame.columns)
tm.assert_series_equal(combined["A"], reordered_frame["A"])
# same index
fcopy = float_frame.copy()
fcopy["A"] = 1
del fcopy["C"]
fcopy2 = float_frame.copy()
fcopy2["B"] = 0
del fcopy2["D"]
combined = fcopy.combine_first(fcopy2)
assert (combined["A"] == 1).all()
tm.assert_series_equal(combined["B"], fcopy["B"])
tm.assert_series_equal(combined["C"], fcopy2["C"])
tm.assert_series_equal(combined["D"], fcopy["D"])
# overlap
head, tail = reordered_frame[:10].copy(), reordered_frame
head["A"] = 1
combined = head.combine_first(tail)
assert (combined["A"][:10] == 1).all()
# reverse overlap
tail.iloc[:10, tail.columns.get_loc("A")] = 0
combined = tail.combine_first(head)
assert (combined["A"][:10] == 0).all()
# no overlap
f = float_frame[:10]
g = float_frame[10:]
combined = f.combine_first(g)
tm.assert_series_equal(combined["A"].reindex(f.index), f["A"])
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
# corner cases
warning = FutureWarning if using_infer_string else None
with tm.assert_produces_warning(warning, match="empty entries"):
comb = float_frame.combine_first(DataFrame())
tm.assert_frame_equal(comb, float_frame)
comb = DataFrame().combine_first(float_frame)
tm.assert_frame_equal(comb, float_frame.sort_index())
comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
assert "faz" in comb.index
# #2525
df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
df2 = DataFrame(columns=["b"])
result = df.combine_first(df2)
assert "b" in result
def test_combine_first_mixed_bug(self):
idx = Index(["a", "b", "c", "e"])
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
ser2 = Series(["a", "b", "c", "e"], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
idx = Index(["a", "b", "c", "f"])
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
ser2 = Series(["a", "b", "c", "f"], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5
def test_combine_first_same_as_in_update(self):
# gh 3016 (same as in update)
df = DataFrame(
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
columns=["A", "B", "bool1", "bool2"],
)
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
result = df.combine_first(other)
tm.assert_frame_equal(result, df)
df.loc[0, "A"] = np.nan
result = df.combine_first(other)
df.loc[0, "A"] = 45
tm.assert_frame_equal(result, df)
def test_combine_first_doc_example(self):
# doc example
df1 = DataFrame(
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)
df2 = DataFrame(
{
"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
"B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
}
)
result = df1.combine_first(df2)
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
tm.assert_frame_equal(result, expected)
def test_combine_first_return_obj_type_with_bools(self):
# GH3552
df1 = DataFrame(
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
)
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
expected = Series([True, True, False], name=2, dtype=bool)
result_12 = df1.combine_first(df2)[2]
tm.assert_series_equal(result_12, expected)
result_21 = df2.combine_first(df1)[2]
tm.assert_series_equal(result_21, expected)
@pytest.mark.parametrize(
"data1, data2, data_expected",
(
(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[pd.NaT, pd.NaT, pd.NaT],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[pd.NaT, pd.NaT, pd.NaT],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
),
)
def test_combine_first_convert_datatime_correctly(
self, data1, data2, data_expected
):
# GH 3593
df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
result = df1.combine_first(df2)
expected = DataFrame({"a": data_expected})
tm.assert_frame_equal(result, expected)
def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
dfb = DataFrame([[4], [5]], columns=["b"])
assert dfa["a"].dtype == "datetime64[ns]"
assert dfa["b"].dtype == "int64"
res = dfa.combine_first(dfb)
exp = DataFrame(
{"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]},
columns=["a", "b"],
)
tm.assert_frame_equal(res, exp)
assert res["a"].dtype == "datetime64[ns]"
# TODO: this must be int64
assert res["b"].dtype == "int64"
res = dfa.iloc[:0].combine_first(dfb)
exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
tm.assert_frame_equal(res, exp)
# TODO: this must be datetime64
assert res["a"].dtype == "float64"
# TODO: this must be int64
assert res["b"].dtype == "int64"
def test_combine_first_timezone(self, unit):
# see gh-7630
data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit)
df1 = DataFrame(
columns=["UTCdatetime", "abc"],
data=data1,
index=pd.date_range("20140627", periods=1),
)
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit)
df2 = DataFrame(
columns=["UTCdatetime", "xyz"],
data=data2,
index=pd.date_range("20140628", periods=1),
)
res = df2[["UTCdatetime"]].combine_first(df1)
exp = DataFrame(
{
"UTCdatetime": [
pd.Timestamp("2010-01-01 01:01", tz="UTC"),
pd.Timestamp("2012-12-12 12:12", tz="UTC"),
],
"abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
},
columns=["UTCdatetime", "abc"],
index=pd.date_range("20140627", periods=2, freq="D"),
dtype=f"datetime64[{unit}, UTC]",
)
assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]"
assert res["abc"].dtype == f"datetime64[{unit}, UTC]"
tm.assert_frame_equal(res, exp)
def test_combine_first_timezone2(self, unit):
# see gh-10567
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit)
df1 = DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit)
df2 = DataFrame({"DATE": dts2})
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["DATE"].dtype == f"datetime64[{unit}, UTC]"
def test_combine_first_timezone3(self, unit):
dts1 = pd.DatetimeIndex(
["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
).as_unit(unit)
df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
dts2 = pd.DatetimeIndex(
["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
).as_unit(unit)
df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.DatetimeIndex(
[
"2011-01-01",
"2012-01-01",
"NaT",
"2012-01-02",
"2011-01-03",
"2011-01-04",
],
tz="US/Eastern",
).as_unit(unit)
exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
# FIXME: parametrizing over unit breaks on non-nano
def test_combine_first_timezone4(self):
# different tz
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
df1 = DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-03", "2015-01-05")
df2 = DataFrame({"DATE": dts2})
# if df1 doesn't have NaN, keep its dtype
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"
def test_combine_first_timezone5(self, unit):
dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit)
df1 = DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit)
df2 = DataFrame({"DATE": dts2})
res = df1.combine_first(df2)
exp_dts = [
pd.Timestamp("2015-01-01", tz="US/Eastern"),
pd.Timestamp("2015-01-02", tz="US/Eastern"),
pd.Timestamp("2015-01-03"),
]
exp = DataFrame({"DATE": exp_dts})
tm.assert_frame_equal(res, exp)
assert res["DATE"].dtype == "object"
def test_combine_first_timedelta(self):
data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
df2 = DataFrame({"TD": data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.TimedeltaIndex(
["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
)
exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["TD"].dtype == "timedelta64[ns]"
def test_combine_first_period(self):
data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
df2 = DataFrame({"P": data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.PeriodIndex(
["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
)
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["P"].dtype == data1.dtype
# different freq
dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
df2 = DataFrame({"P": dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = [
pd.Period("2011-01", freq="M"),
pd.Period("2012-01-01", freq="D"),
pd.NaT,
pd.Period("2012-01-02", freq="D"),
pd.Period("2011-03", freq="M"),
pd.Period("2011-04", freq="M"),
]
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["P"].dtype == "object"
def test_combine_first_int(self):
# GH14687 - integer series that do no align exactly
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
result_12 = df1.combine_first(df2)
expected_12 = DataFrame({"a": [0, 1, 3, 5]})
tm.assert_frame_equal(result_12, expected_12)
result_21 = df2.combine_first(df1)
expected_21 = DataFrame({"a": [1, 4, 3, 5]})
tm.assert_frame_equal(result_21, expected_21)
@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
# see gh-20699
df1 = DataFrame({"isNum": [val]})
df2 = DataFrame({"isBool": [True]})
res = df1.combine_first(df2)
exp = DataFrame({"isBool": [True], "isNum": [val]})
tm.assert_frame_equal(res, exp)
def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
# GH: 37519
df = DataFrame(
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
)
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
df.set_index(["a", "b"], inplace=True)
df2.set_index(["a", "b"], inplace=True)
result = df.combine_first(df2)
expected = DataFrame(
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
).set_index(["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"scalar1, scalar2",
[
(datetime(2020, 1, 1), datetime(2020, 1, 2)),
(pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
(pd.Timedelta("89 days"), pd.Timedelta("60 min")),
(pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
],
)
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
# GH28481
na_value = nulls_fixture
frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])
common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]])
if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]:
val = scalar1
else:
val = na_value
result = frame.combine_first(other)
expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
expected["b"] = expected["b"].astype(common_dtype)
tm.assert_frame_equal(result, expected)
def test_combine_first_timestamp_bug_NaT():
    """An all-NaT caller is filled with datetimes from ``other``."""
    # GH28481
    first_day = datetime(2020, 1, 1)
    second_day = datetime(2020, 1, 2)

    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame([[first_day, second_day]], columns=["b", "c"])

    result = frame.combine_first(other)
    expected = DataFrame([[pd.NaT, first_day, second_day]], columns=["a", "b", "c"])

    tm.assert_frame_equal(result, expected)
def test_combine_first_with_nan_multiindex():
    """``combine_first`` handles a MultiIndex that contains NaN in a level."""
    # gh-36562
    level_names = ["a", "b"]

    mi1 = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=level_names
    )
    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)

    mi2 = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=level_names
    )
    other = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=mi2)})

    res = df.combine_first(other)

    mi_expected = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    expected = DataFrame(
        {
            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
        },
        index=mi_expected,
    )
    tm.assert_frame_equal(res, expected)
def test_combine_preserve_dtypes():
    """The shared integer column stays integer when row labels are disjoint."""
    # GH7509
    df1 = DataFrame(
        {
            "A": Series(["a", "b"], index=range(2)),
            "B": Series(range(2), index=range(2)),
        }
    )
    df2 = DataFrame(
        {
            "B": Series(range(-1, 1), index=range(5, 7)),
            "C": Series(["a", "b"], index=range(5, 7)),
        }
    )

    expected = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )
    tm.assert_frame_equal(df1.combine_first(df2), expected)
def test_combine_first_duplicates_rows_for_nan_index_values():
# GH39881
df1 = DataFrame(
{"x": [9, 10, 11]},
index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]),
)
df2 = DataFrame(
{"y": [12, 13, 14]},
index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]),
)
expected = DataFrame(
{
"x": [9.0, 10.0, 11.0, np.nan],
"y": [12.0, 13.0, np.nan, 14.0],
},
index=MultiIndex.from_arrays(
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
),
)
combined = df1.combine_first(df2)
tm.assert_frame_equal(combined, expected)
def test_combine_first_int64_not_cast_to_float64():
    """Integer columns stay integer when ``other`` only adds a column."""
    # GH 28613
    narrow = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    wide = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})

    expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(narrow.combine_first(wide), expected)
def test_midx_losing_dtype():
    """Combine frames whose MultiIndex has an all-NaN second level."""
    # GH#49830
    nan_pair = [np.nan, np.nan]
    df1 = DataFrame(
        {"a": [None, 4]}, index=MultiIndex.from_arrays([[0, 0], nan_pair])
    )
    df2 = DataFrame({"a": [3, 3]}, index=MultiIndex.from_arrays([[1, 1], nan_pair]))

    result = df1.combine_first(df2)

    expected_midx = MultiIndex.from_arrays(
        [[0, 0, 1, 1], [np.nan, np.nan, np.nan, np.nan]]
    )
    expected = DataFrame({"a": [np.nan, 4, 3, 3]}, index=expected_midx)
    tm.assert_frame_equal(result, expected)
def test_combine_first_empty_columns():
left = DataFrame(columns=["a", "b"])
right = DataFrame(columns=["a", "c"])
result = left.combine_first(right)
expected = DataFrame(columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,305 @@
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p25
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """``align_axis`` decides whether self/other are laid out as rows or columns."""
    # GH#30429
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    changed_rows = [0, 2]
    changed_cols = ["col1", "col3"]
    sides = ["self", "other"]

    if align_axis in (1, "columns"):
        rows = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
        indices = pd.Index(changed_rows)
        columns = pd.MultiIndex.from_product([changed_cols, sides])
    else:
        rows = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]
        indices = pd.MultiIndex.from_product([changed_rows, sides])
        columns = pd.Index(changed_cols)

    expected = pd.DataFrame(rows, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Check ``compare`` output for the ``keep_shape``/``keep_equal`` combinations."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal)

    sides = ["self", "other"]
    if not keep_shape:
        # Only the rows and columns that differ are kept.
        indices = pd.Index([0, 2])
        columns = pd.MultiIndex.from_product([["col1", "col3"], sides])
        rows = [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]]
    else:
        indices = pd.Index([0, 1, 2])
        columns = pd.MultiIndex.from_product([["col1", "col2", "col3"], sides])
        if keep_equal:
            rows = [
                ["a", "c", 1.0, 1.0, 1.0, 1.0],
                ["b", "b", 2.0, 2.0, 2.0, 2.0],
                ["c", "c", np.nan, np.nan, 3.0, 4.0],
            ]
        else:
            rows = [
                ["a", "c", np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
            ]

    expected = pd.DataFrame(rows, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_with_equal_nulls():
    """NaNs at the same position count as equal and are dropped."""
    # We want to make sure two NaNs are considered the same
    # and dropped where applicable
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"

    result = df.compare(df2)

    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
def test_compare_with_non_equal_nulls():
    """A NaN that replaces a real value is reported as a difference."""
    # We want to make sure the relevant NaNs do not get dropped
    # even if the entire row or column are NaNs
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = np.nan

    result = df.compare(df2)

    expected = pd.DataFrame(
        [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
        index=pd.Index([0, 2]),
        columns=pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """``compare`` adds the self/other level to an existing MultiIndex."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]])
    df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    df2 = df.copy()
    df2.iloc[0, 0] = "c"
    df2.iloc[2, 2] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    sides = ["self", "other", "self", "other"]
    if align_axis != 0:
        # self/other becomes the innermost column level
        indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        columns = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                sides,
            ]
        )
        data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
    else:
        # self/other becomes the innermost row level
        indices = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], sides]
        )
        columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]

    expected = pd.DataFrame(data=data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_unaligned_objects():
    """Check that ``compare`` rejects frames whose labels do not line up.

    Two cases are exercised: frames with different index labels and frames
    with different shapes. Both must raise ``ValueError`` carrying the
    "identically-labeled" message.
    """
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # test DataFrames with different indices
    # The inputs are built outside ``pytest.raises`` so that only the
    # ``compare`` call itself can satisfy the expected exception.
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
def test_compare_result_names():
    """Check that ``result_names`` replaces the default self/other labels."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    diff = left.compare(right, result_names=("left", "right"))

    expected_columns = {
        ("col1", "left"): {0: "a", 2: np.nan},
        ("col1", "right"): {0: "c", 2: np.nan},
        ("col3", "left"): {0: np.nan, 2: 3.0},
        ("col3", "right"): {0: np.nan, 2: np.nan},
    }
    tm.assert_frame_equal(diff, pd.DataFrame(expected_columns))
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Check that a non-tuple ``result_names`` is rejected with ``TypeError``."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )
    # The message embeds the type of the offending argument.
    expected_msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_msg):
        left.compare(right, result_names=result_names)
@pytest.mark.parametrize(
    "val1,val2",
    [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
)
def test_compare_ea_and_np_dtype(val1, val2):
    """Compare a column built from a plain list against an ``Int64`` column.

    When the list-backed side holds ``pd.NA`` and ``np_version_gte1p25`` is
    true, the comparison is expected to raise ``TypeError``; otherwise the
    result is checked against the expected frame.
    """
    # GH 48966
    values = [4.0, val1]
    nullable = pd.Series([1, val2], dtype="Int64")

    left = pd.DataFrame({"a": values, "b": [1.0, 2]})
    right = pd.DataFrame({"a": nullable, "b": [1.0, 2]})
    expected = pd.DataFrame(
        {
            ("a", "self"): values,
            ("a", "other"): nullable,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    if val1 is pd.NA and val2 is pd.NA:
        # GH#18463 TODO: is this really the desired behavior?
        expected.loc[1, ("a", "self")] = np.nan

    if val1 is not pd.NA or not np_version_gte1p25:
        result = left.compare(right, keep_shape=True)
        tm.assert_frame_equal(result, expected)
    else:
        # can't compare with numpy array if it contains pd.NA
        with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
            left.compare(right, keep_shape=True)
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """Compare two frames whose ``a`` column is ``Int64`` on both sides.

    Only row 0 of ``a`` may differ; ``diff_self``/``diff_other`` are the values
    expected in that row of the ``keep_shape=True`` result.
    """
    # GH 48966
    left = pd.DataFrame(
        {"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}
    )
    right = left.copy()
    right.loc[0, "a"] = df2_val

    expected_self = pd.Series([diff_self, pd.NA], dtype="Int64")
    expected_other = pd.Series([diff_other, pd.NA], dtype="Int64")
    expected = pd.DataFrame(
        {
            ("a", "self"): expected_self,
            ("a", "other"): expected_other,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    tm.assert_frame_equal(left.compare(right, keep_shape=True), expected)

View File

@ -0,0 +1,202 @@
import datetime
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class TestConvertDtypes:
    """Tests for ``DataFrame.convert_dtypes``."""

    @pytest.mark.parametrize(
        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
    )
    def test_convert_dtypes(
        self, convert_integer, expected, string_storage, using_infer_string
    ):
        """Convert an int32/object frame and check the dtype of each column."""
        # Specific types are tested in tests/series/test_dtypes.py
        # Just check that it works for DataFrame here
        storage = "pyarrow_numpy" if using_infer_string else string_storage
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
            }
        )
        with pd.option_context("string_storage", storage):
            result = frame.convert_dtypes(True, True, convert_integer, False)

        expected_frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=expected),
                "b": pd.Series(["x", "y", "z"], dtype=f"string[{storage}]"),
            }
        )
        tm.assert_frame_equal(result, expected_frame)

    def test_convert_empty(self):
        """An empty frame converts to an equal empty frame."""
        # Empty DataFrame can pass convert_dtypes, see GH#40393
        empty = pd.DataFrame()
        converted = empty.convert_dtypes()
        tm.assert_frame_equal(empty, converted)

    def test_convert_dtypes_retain_column_names(self):
        """The name of the columns Index survives the conversion."""
        # GH#41435
        frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        frame.columns.name = "cols"

        converted = frame.convert_dtypes()

        tm.assert_index_equal(converted.columns, frame.columns)
        assert converted.columns.name == "cols"

    def test_pyarrow_dtype_backend(self):
        """Convert numpy-typed columns with ``dtype_backend="pyarrow"``."""
        pa = pytest.importorskip("pyarrow")
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
                "c": pd.Series([True, False, None], dtype=np.dtype("O")),
                "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
                "e": pd.Series(pd.date_range("2022", periods=3)),
                "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
                "g": pd.Series(pd.timedelta_range("1D", periods=3)),
            }
        )

        result = frame.convert_dtypes(dtype_backend="pyarrow")

        # Columns "e" and "f" hold the same three calendar days.
        days = [datetime.datetime(2022, 1, day) for day in (1, 2, 3)]
        deltas = [datetime.timedelta(count) for count in (1, 2, 3)]
        arrow = pd.arrays.ArrowExtensionArray
        expected = pd.DataFrame(
            {
                "a": arrow(pa.array([1, 2, 3], type=pa.int32())),
                "b": arrow(pa.array(["x", "y", None])),
                "c": arrow(pa.array([True, False, None])),
                "d": arrow(pa.array([None, 100.5, 200.0])),
                "e": arrow(pa.array(days, type=pa.timestamp(unit="ns"))),
                "f": arrow(pa.array(days, type=pa.timestamp(unit="s", tz="UTC"))),
                "g": arrow(pa.array(deltas, type=pa.duration("ns"))),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_already_pyarrow(self):
        """A frame that is already pyarrow-backed is returned unchanged."""
        pytest.importorskip("pyarrow")
        original = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
        converted = original.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(converted, original)

    def test_pyarrow_dtype_backend_from_pandas_nullable(self):
        """Convert pandas nullable dtypes with ``dtype_backend="pyarrow"``."""
        pa = pytest.importorskip("pyarrow")
        frame = pd.DataFrame(
            {
                "a": pd.Series([1, 2, None], dtype="Int32"),
                "b": pd.Series(["x", "y", None], dtype="string[python]"),
                "c": pd.Series([True, False, None], dtype="boolean"),
                "d": pd.Series([None, 100.5, 200], dtype="Float64"),
            }
        )

        result = frame.convert_dtypes(dtype_backend="pyarrow")

        arrow = pd.arrays.ArrowExtensionArray
        expected = pd.DataFrame(
            {
                "a": arrow(pa.array([1, 2, None], type=pa.int32())),
                "b": arrow(pa.array(["x", "y", None])),
                "c": arrow(pa.array([True, False, None])),
                "d": arrow(pa.array([None, 100.5, 200.0])),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_empty_object(self):
        """A frame with one column and no rows is returned unchanged."""
        # GH 50970
        pytest.importorskip("pyarrow")
        original = pd.DataFrame(columns=[0])
        converted = original.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(converted, original)

    def test_pyarrow_engine_lines_false(self):
        """An unsupported ``dtype_backend`` value raises ``ValueError``."""
        # GH 48893
        frame = pd.DataFrame({"a": [1, 2, 3]})
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        with pytest.raises(ValueError, match=msg):
            frame.convert_dtypes(dtype_backend="numpy")

    def test_pyarrow_backend_no_conversion(self):
        """With every ``convert_*`` flag off, the frame is returned unchanged."""
        # GH#52872
        pytest.importorskip("pyarrow")
        frame = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
        expected = frame.copy()

        result = frame.convert_dtypes(
            convert_floating=False,
            convert_integer=False,
            convert_boolean=False,
            convert_string=False,
            dtype_backend="pyarrow",
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        """An ``int32[pyarrow]`` frame converts to ``Int32``."""
        # GH 53648
        pytest.importorskip("pyarrow")
        arrow_frame = pd.DataFrame(range(2), dtype="int32[pyarrow]")

        result = arrow_frame.convert_dtypes(dtype_backend="numpy_nullable")

        tm.assert_frame_equal(result, pd.DataFrame(range(2), dtype="Int32"))

    def test_convert_dtypes_pyarrow_timestamp(self):
        """A ``timestamp[ms][pyarrow]`` Series is returned unchanged."""
        # GH 54191
        pytest.importorskip("pyarrow")
        minutes = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min"))
        expected = minutes.astype("timestamp[ms][pyarrow]")

        result = expected.convert_dtypes(dtype_backend="pyarrow")

        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_avoid_block_splitting(self):
        """The result holds exactly two blocks after the string conversion."""
        # GH#55341
        frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})

        result = frame.convert_dtypes(convert_integer=False)

        expected = pd.DataFrame(
            {
                "a": [1, 2, 3],
                "b": [4, 5, 6],
                "c": pd.Series(["a"] * 3, dtype="string[python]"),
            }
        )
        tm.assert_frame_equal(result, expected)
        assert result._mgr.nblocks == 2

    def test_convert_dtypes_from_arrow(self):
        """Only the string column changes dtype in a str/``datetime.time`` frame."""
        # GH#56581
        frame = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])

        result = frame.convert_dtypes()

        expected = frame.astype({"a": "string[python]"})
        tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,64 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
class TestCopy:
    """Tests for ``DataFrame.copy``."""

    @pytest.mark.parametrize("attr", ["index", "columns"])
    def test_copy_index_name_checking(self, float_frame, attr):
        """Renaming an axis on the copy leaves the original axis name alone."""
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        axis = getattr(float_frame, attr)
        axis.name = None

        duplicate = float_frame.copy()
        getattr(duplicate, attr).name = "foo"

        assert getattr(float_frame, attr).name is None

    @td.skip_copy_on_write_invalid_test
    def test_copy_cache(self):
        """Writes through ``frame["a"].values`` stay visible after ``copy()``."""
        # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
        frame = DataFrame({"a": [1]})

        # The statement order below is the scenario under test; keep it as is.
        frame["x"] = [0]
        frame["a"]
        frame.copy()

        frame["a"].values[0] = -1
        tm.assert_frame_equal(frame, DataFrame({"a": [-1], "x": [0]}))

        frame["y"] = [0]
        assert frame["a"].values[0] == -1
        tm.assert_frame_equal(frame, DataFrame({"a": [-1], "x": [0], "y": [0]}))

    def test_copy(self, float_frame, float_string_frame):
        """A column added to the copy is absent from the original."""
        duplicate = float_frame.copy()
        duplicate["E"] = duplicate["A"]
        assert "E" not in float_frame

        # copy objects
        copied = float_string_frame.copy()
        assert copied._mgr is not float_string_frame._mgr

    @td.skip_array_manager_invalid_test
    def test_copy_consolidates(self):
        """A frame holding 11 blocks is copied into a single block."""
        # GH#42477
        frame = DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 100, size=55),
                "b": np.random.default_rng(2).integers(0, 100, size=55),
            }
        )

        # Each column added through .loc lands in a block of its own.
        for i in range(10):
            frame.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(
                0, 100, size=55
            )
        assert len(frame._mgr.blocks) == 11

        copied = frame.copy()
        assert len(copied._mgr.blocks) == 1

View File

@ -0,0 +1,39 @@
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestDataFrameCount:
    """Tests for ``DataFrame.count``."""

    def test_count(self):
        """Count over frames that have no rows, no columns, or neither."""
        # corner case
        empty = DataFrame()
        by_row = empty.count(1)
        assert isinstance(by_row, Series)
        by_column = empty.count(0)
        assert isinstance(by_column, Series)

        # GH#423
        rows_only = DataFrame(index=range(10))
        tm.assert_series_equal(rows_only.count(1), Series(0, index=rows_only.index))

        columns_only = DataFrame(columns=range(10))
        tm.assert_series_equal(
            columns_only.count(0), Series(0, index=columns_only.columns)
        )

        nothing = DataFrame()
        tm.assert_series_equal(nothing.count(), Series(dtype="int64"))

    def test_count_objects(self, float_string_frame):
        """Two frames built from the same ``_series`` mapping count alike."""
        first = DataFrame(float_string_frame._series)
        second = DataFrame(float_string_frame._series)

        tm.assert_series_equal(first.count(), second.count())
        tm.assert_series_equal(first.count(1), second.count(1))

View File

@ -0,0 +1,471 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
date_range,
isna,
)
import pandas._testing as tm
class TestDataFrameCov:
    """Tests for ``DataFrame.cov``."""

    def test_cov(self, float_frame, float_string_frame):
        """Cover ``min_periods``, NaN handling, non-numeric and 1-column input."""
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame.iloc[:5, frame.columns.get_loc("A")] = np.nan
        frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        # A and B share only len(frame) - 10 complete rows, which is below
        # the requested min_periods, so their covariance is expected as NaN.
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)

        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)

        # fails on non-numeric types
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.cov()
        result = float_string_frame.cov(numeric_only=True)
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
        )
        tm.assert_frame_equal(result, expected)

        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        """``DataFrame.cov(ddof=...)`` agrees with ``np.cov`` for each ddof."""
        # GH#34611
        # Draw both arrays from ONE generator. Two generators created with
        # the same seed yield identical arrays, which would make every entry
        # of the covariance matrix equal and hide variance/covariance mix-ups.
        rng = np.random.default_rng(2)
        np_array1 = rng.random(10)
        np_array2 = rng.random(10)
        df = DataFrame({0: np_array1, 1: np_array2})

        result = df.cov(ddof=test_ddof)

        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
    )
    def test_cov_nullable_integer(self, other_column):
        """Covariance works when a column is a nullable array holding ``None``."""
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_cov_numeric_only(self, numeric_only):
        """String columns are dropped with ``numeric_only``, else an error."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(0.5, index=["a"], columns=["a"])
        if numeric_only:
            result = df.cov(numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
        else:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.cov(numeric_only=numeric_only)
class TestDataFrameCorr:
    """Tests for ``DataFrame.corr``."""

    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    def test_corr_scipy_method(self, float_frame, method):
        """The A/C entry of the matrix equals the pairwise ``Series.corr``."""
        pytest.importorskip("scipy")
        labels = float_frame.index
        float_frame.loc[labels[:5], "A"] = np.nan
        float_frame.loc[labels[5:10], "B"] = np.nan
        float_frame.loc[labels[:10], "A"] = float_frame["A"][10:20].copy()

        matrix = float_frame.corr(method=method)
        pairwise = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(matrix["A"]["C"], pairwise)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        """String columns raise unless ``numeric_only=True`` is passed."""
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.corr()

        numeric_part = float_string_frame.loc[:, ["A", "B", "C", "D"]]
        tm.assert_frame_equal(
            float_string_frame.corr(numeric_only=True), numeric_part.corr()
        )

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        """Columns with no common non-NaN rows correlate to NaN."""
        # nothing in common
        pytest.importorskip("scipy")
        frame = DataFrame(
            {
                "A": [1, 1.5, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            }
        )
        matrix = frame.corr(meth)

        assert isna(matrix.loc["A", "B"])
        assert isna(matrix.loc["B", "A"])
        assert matrix.loc["A", "A"] == 1
        assert matrix.loc["B", "B"] == 1
        assert isna(matrix.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        """Constant columns give an all-NaN correlation matrix."""
        # constant --> all NA
        frame = DataFrame(
            {
                "A": [1, 1, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1, 1],
            }
        )
        matrix = frame.corr(meth)
        assert isna(matrix.values).all()

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        """A bool column and an int column correlate to a matrix of ones."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        frame = DataFrame({"a": [True, False], "b": [1, 0]})

        all_ones = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
        tm.assert_frame_equal(frame.corr(meth), all_ones)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        """Result index and columns are equal but distinct objects."""
        # GH#14617
        frame = DataFrame(
            np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4),
            columns=list("abcd"),
        )
        matrix = getattr(frame, method)()
        assert matrix.index is not matrix.columns
        assert matrix.index.equals(matrix.columns)

    def test_corr_invalid_method(self):
        """An unknown ``method`` string raises ``ValueError``."""
        # GH#22298
        frame = DataFrame(np.random.default_rng(2).normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            frame.corr(method="____")

    def test_corr_int(self):
        """Smoke test: ``cov`` and ``corr`` run on integer columns."""
        # dtypes other than float64 GH#1761
        frame = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        frame.cov()
        frame.corr()

    @pytest.mark.parametrize(
        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
    )
    @pytest.mark.parametrize(
        "other_column",
        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column, method):
        """Nullable and numpy columns correlate to a matrix of ones."""
        # https://github.com/pandas-dev/pandas/issues/33803
        pytest.importorskip("scipy")
        data = DataFrame({"a": nullable_column, "b": other_column})

        all_ones = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(data.corr(method=method), all_ones)

    def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write):
        """``corr`` leaves the link between a cached column and its frame intact."""
        # Check that corr does not lead to incorrect entries in item_cache

        frame = DataFrame({"A": range(10)})
        frame["B"] = range(10)[::-1]

        column = frame["A"]  # populate item_cache
        assert len(frame._mgr.arrays) == 2  # i.e. 2 blocks

        _ = frame.corr(numeric_only=True)

        if using_copy_on_write:
            column.iloc[0] = 99
            assert frame.loc[0, "A"] == 0
        else:
            # Check that the corr didn't break link between ser and df
            column.values[0] = 99
            assert frame.loc[0, "A"] == 99
            if not warn_copy_on_write:
                assert frame["A"] is column
            assert frame.values[0, 0] == 99

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        """Constant columns of any tested length correlate to all-NaN."""
        # GH: 37448
        frame = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])

        all_nan = DataFrame(
            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
        )
        tm.assert_frame_equal(frame.corr(), all_nan)

    def test_calc_corr_small_numbers(self):
        """Identical columns of magnitude 1e-20 still correlate to 1.0."""
        # GH: 37452
        frame = DataFrame(
            {"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
        )

        all_ones = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
        tm.assert_frame_equal(frame.corr(), all_ones)

    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        """``min_periods`` above the row count gives an all-NaN matrix."""
        pytest.importorskip("scipy")
        frame = DataFrame({"A": [1, 2], "B": [1, 2]})

        result = frame.corr(method=method, min_periods=3)

        all_nan = DataFrame(
            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
        )
        tm.assert_frame_equal(result, all_nan)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corr_numeric_only(self, meth, numeric_only):
        """String columns are dropped with ``numeric_only``, else an error."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        frame = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
        all_ones = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])

        if not numeric_only:
            with pytest.raises(ValueError, match="could not convert string to float"):
                frame.corr(meth, numeric_only=numeric_only)
        else:
            result = frame.corr(meth, numeric_only=numeric_only)
            tm.assert_frame_equal(result, all_ones)
class TestDataFrameCorrWith:
    """Tests for ``DataFrame.corrwith``."""

    @pytest.mark.parametrize(
        "dtype",
        [
            "float64",
            "Float64",
            pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
        ],
    )
    def test_corrwith(self, datetime_frame, dtype):
        """Check column-wise and row-wise ``corrwith``, with and without drop."""
        datetime_frame = datetime_frame.astype(dtype)
        left = datetime_frame
        noise = Series(
            np.random.default_rng(2).standard_normal(len(left)), index=left.index
        )
        right = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        right = right.reindex(columns=right.columns[::-1], index=right.index[::-1][10:])
        del right["B"]

        by_column = left.corrwith(right, axis=0)
        tm.assert_almost_equal(by_column["A"], left["A"].corr(right["A"]))

        by_row = left.corrwith(right, axis=1)
        tm.assert_series_equal(by_row, left.T.corrwith(right.T, axis=0))

        dropped = left.corrwith(right, axis=0, drop=True)
        tm.assert_almost_equal(dropped["A"], left["A"].corr(right["A"]))
        assert "B" not in dropped

        dropped = left.corrwith(right, axis=1, drop=True)
        assert left.index[-1] not in dropped.index

        # non time-series data
        row_labels = ["a", "b", "c", "d", "e"]
        column_labels = ["one", "two", "three", "four"]
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((5, 4)),
            index=row_labels,
            columns=column_labels,
        )
        df2 = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=row_labels[:4],
            columns=column_labels,
        )
        row_correls = df1.corrwith(df2, axis=1)
        for label in row_labels[:4]:
            pairwise = df1.loc[label].corr(df2.loc[label])
            tm.assert_almost_equal(row_correls[label], pairwise)

    def test_corrwith_with_objects(self, using_infer_string):
        """A string column raises unless ``numeric_only=True`` is passed."""
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=date_range("2000-01-01", periods=10, freq="B"),
        )
        df2 = df1.copy()
        numeric_cols = ["A", "B", "C", "D"]

        df1["obj"] = "foo"
        df2["obj"] = "bar"

        if not using_infer_string:
            with pytest.raises(TypeError, match="Could not convert"):
                df1.corrwith(df2)
        else:
            import pyarrow as pa

            with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
                df1.corrwith(df2)

        result = df1.corrwith(df2, numeric_only=True)
        expected = df1.loc[:, numeric_cols].corrwith(df2.loc[:, numeric_cols])
        tm.assert_series_equal(result, expected)

        with pytest.raises(TypeError, match="unsupported operand type"):
            df1.corrwith(df2, axis=1)

        result = df1.corrwith(df2, axis=1, numeric_only=True)
        expected = df1.loc[:, numeric_cols].corrwith(df2.loc[:, numeric_cols], axis=1)
        tm.assert_series_equal(result, expected)

    def test_corrwith_series(self, datetime_frame):
        """``corrwith`` a Series equals applying ``Series.corr`` per column."""
        column_a = datetime_frame["A"]
        result = datetime_frame.corrwith(column_a)
        expected = datetime_frame.apply(datetime_frame["A"].corr)

        tm.assert_series_equal(result, expected)

    def test_corrwith_matches_corrcoef(self):
        """``corrwith`` agrees with ``np.corrcoef`` and stays below 1."""
        df1 = DataFrame(np.arange(10000), columns=["a"])
        df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])

        from_pandas = df1.corrwith(df2)["a"]
        from_numpy = np.corrcoef(df1["a"], df2["a"])[0][1]

        tm.assert_almost_equal(from_pandas, from_numpy)
        assert from_pandas < 1

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corrwith_mixed_dtypes(self, numeric_only):
        """String columns are dropped with ``numeric_only``, else an error."""
        # GH#18570
        frame = DataFrame(
            {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
        )
        other = Series([0, 6, 7, 3])

        if not numeric_only:
            with pytest.raises(
                ValueError,
                match="could not convert string to float",
            ):
                frame.corrwith(other, numeric_only=numeric_only)
        else:
            result = frame.corrwith(other, numeric_only=numeric_only)
            corrs = [frame["a"].corr(other), frame["b"].corr(other)]
            expected = Series(data=corrs, index=["a", "b"])
            tm.assert_series_equal(result, expected)

    def test_corrwith_index_intersection(self):
        """``drop=True`` keeps only the labels common to both frames."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=True).index.sort_values()
        expected = df1.columns.intersection(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_index_union(self):
        """``drop=False`` keeps the labels found in either frame."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_dup_cols(self):
        """A duplicated column label in ``other`` appears twice in the result."""
        # GH#21925
        df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)

        expected = Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)

    def test_corr_numerical_instabilities(self):
        """Perfectly anti-correlated columns give -1.0 within ``atol=1e-17``."""
        # GH#45640
        frame = DataFrame([[0.2, 0.4], [0.4, 0.2]])

        result = frame.corr()

        expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
        tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)

    def test_corrwith_spearman(self):
        """Spearman ``corrwith`` of a frame and its square is all ones."""
        # GH#21925
        pytest.importorskip("scipy")
        frame = DataFrame(np.random.default_rng(2).random(size=(100, 3)))

        result = frame.corrwith(frame**2, method="spearman")

        tm.assert_series_equal(result, Series(np.ones(len(result))))

    def test_corrwith_kendall(self):
        """Kendall ``corrwith`` of a frame and its square is all ones."""
        # GH#21925
        pytest.importorskip("scipy")
        frame = DataFrame(np.random.default_rng(2).random(size=(100, 3)))

        result = frame.corrwith(frame**2, method="kendall")

        tm.assert_series_equal(result, Series(np.ones(len(result))))

    def test_corrwith_spearman_with_tied_data(self):
        """Check ``corrwith`` on data containing ties and boolean values."""
        # GH#48826
        pytest.importorskip("scipy")
        df1 = DataFrame(
            {
                "A": [1, np.nan, 7, 8],
                "B": [False, True, True, False],
                "C": [10, 4, 9, 3],
            }
        )
        df2 = df1[["B", "C"]]

        result = (df1 + 1).corrwith(df2.B, method="spearman")
        expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

        bool_frame = DataFrame(
            {"A": [True, True, False, False], "B": [True, False, False, True]}
        )
        bool_series = Series([True, True, False, True])

        result = bool_frame.corrwith(bool_series)
        expected = Series([0.57735, 0.57735], index=["A", "B"])
        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,417 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestDataFrameDescribe:
def test_describe_bool_in_mixed_frame(self):
df = DataFrame(
{
"string_data": ["a", "b", "c", "d", "e"],
"bool_data": [True, True, False, False, False],
"int_data": [10, 20, 30, 40, 50],
}
)
# Integer data are included in .describe() output,
# Boolean and string data are not.
result = df.describe()
expected = DataFrame(
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
# Top value is a boolean value that is False
result = df.describe(include=["bool"])
expected = DataFrame(
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
)
tm.assert_frame_equal(result, expected)
def test_describe_empty_object(self):
# GH#27183
df = DataFrame({"A": [None, None]}, dtype=object)
result = df.describe()
expected = DataFrame(
{"A": [0, 0, np.nan, np.nan]},
dtype=object,
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
result = df.iloc[:0].describe()
tm.assert_frame_equal(result, expected)
def test_describe_bool_frame(self):
# GH#13891
df = DataFrame(
{
"bool_data_1": [False, False, True, True],
"bool_data_2": [False, True, True, True],
}
)
result = df.describe()
expected = DataFrame(
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
df = DataFrame(
{
"bool_data": [False, False, True, True, False],
"int_data": [0, 1, 2, 3, 4],
}
)
result = df.describe()
expected = DataFrame(
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
df = DataFrame(
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
)
result = df.describe()
expected = DataFrame(
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
def test_describe_categorical(self):
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)
df = df.sort_values(by=["value"], ascending=True)
df["value_group"] = pd.cut(
df.value, range(0, 10500, 500), right=False, labels=cat_labels
)
cat = df
# Categoricals should not show up together with numerical columns
result = cat.describe()
assert len(result.columns) == 1
# In a frame, describe() for the cat should be the same as for string
# arrays (count, unique, top, freq)
cat = Categorical(
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
)
s = Series(cat)
result = s.describe()
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
tm.assert_series_equal(result, expected)
cat = Series(Categorical(["a", "b", "c", "c"]))
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
result = df3.describe()
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
def test_describe_empty_categorical_column(self):
# GH#26397
# Ensure the index of an empty categorical DataFrame column
# also contains (count, unique, top, freq)
df = DataFrame({"empty_col": Categorical([])})
result = df.describe()
expected = DataFrame(
{"empty_col": [0, 0, np.nan, np.nan]},
index=["count", "unique", "top", "freq"],
dtype="object",
)
tm.assert_frame_equal(result, expected)
# ensure NaN, not None
assert np.isnan(result.iloc[2, 0])
assert np.isnan(result.iloc[3, 0])
def test_describe_categorical_columns(self):
# GH#11558
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
df = DataFrame(
{
"int1": [10, 20, 30, 40, 50],
"int2": [10, 20, 30, 40, 50],
"obj": ["A", 0, None, "X", 1],
},
columns=columns,
)
result = df.describe()
exp_columns = pd.CategoricalIndex(
["int1", "int2"],
categories=["int1", "int2", "obj"],
ordered=True,
name="XXX",
)
expected = DataFrame(
{
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
columns=exp_columns,
)
tm.assert_frame_equal(result, expected)
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
def test_describe_datetime_columns(self):
columns = pd.DatetimeIndex(
["2011-01-01", "2011-02-01", "2011-03-01"],
freq="MS",
tz="US/Eastern",
name="XXX",
)
df = DataFrame(
{
0: [10, 20, 30, 40, 50],
1: [10, 20, 30, 40, 50],
2: ["A", 0, None, "X", 1],
}
)
df.columns = columns
result = df.describe()
exp_columns = pd.DatetimeIndex(
["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
)
expected = DataFrame(
{
0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
expected.columns = exp_columns
tm.assert_frame_equal(result, expected)
assert result.columns.freq == "MS"
assert result.columns.tz == expected.columns.tz
def test_describe_timedelta_values(self):
# GH#6145
t1 = pd.timedelta_range("1 days", freq="D", periods=5)
t2 = pd.timedelta_range("1 hours", freq="h", periods=5)
df = DataFrame({"t1": t1, "t2": t2})
expected = DataFrame(
{
"t1": [
5,
pd.Timedelta("3 days"),
df.iloc[:, 0].std(),
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
pd.Timedelta("4 days"),
pd.Timedelta("5 days"),
],
"t2": [
5,
pd.Timedelta("3 hours"),
df.iloc[:, 1].std(),
pd.Timedelta("1 hours"),
pd.Timedelta("2 hours"),
pd.Timedelta("3 hours"),
pd.Timedelta("4 hours"),
pd.Timedelta("5 hours"),
],
},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
result = df.describe()
tm.assert_frame_equal(result, expected)
exp_repr = (
" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00"
)
assert repr(result) == exp_repr
def test_describe_tz_values(self, tz_naive_fixture):
# GH#21332
tz = tz_naive_fixture
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = DataFrame({"s1": s1, "s2": s2})
expected = DataFrame(
{
"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
"s2": [
5,
Timestamp(2018, 1, 3).tz_localize(tz),
start.tz_localize(tz),
s2[1],
s2[2],
s2[3],
end.tz_localize(tz),
np.nan,
],
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)
def test_datetime_is_numeric_includes_datetime(self):
df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
result = df.describe()
expected = DataFrame(
{
"a": [
3,
Timestamp("2012-01-02"),
Timestamp("2012-01-01"),
Timestamp("2012-01-01T12:00:00"),
Timestamp("2012-01-02"),
Timestamp("2012-01-02T12:00:00"),
Timestamp("2012-01-03"),
np.nan,
],
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
tm.assert_frame_equal(result, expected)
def test_describe_tz_values2(self):
tz = "CET"
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = DataFrame({"s1": s1, "s2": s2})
s1_ = s1.describe()
s2_ = s2.describe()
idx = [
"count",
"mean",
"min",
"25%",
"50%",
"75%",
"max",
"std",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
idx, copy=False
)
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)
def test_describe_percentiles_integer_idx(self):
# GH#26660
df = DataFrame({"x": [1]})
pct = np.linspace(0, 1, 10 + 1)
result = df.describe(percentiles=pct)
expected = DataFrame(
{"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]},
index=[
"count",
"mean",
"std",
"min",
"0%",
"10%",
"20%",
"30%",
"40%",
"50%",
"60%",
"70%",
"80%",
"90%",
"100%",
"max",
],
)
tm.assert_frame_equal(result, expected)
def test_describe_does_not_raise_error_for_dictlike_elements(self):
# GH#32409
df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
expected = DataFrame(
{"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
)
result = df.describe()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
def test_describe_when_include_all_exclude_not_allowed(self, exclude):
    """
    Combining include="all" with any non-None exclude must raise ValueError.
    """
    frame = DataFrame({"x": [1], "y": [2], "z": [3]})
    with pytest.raises(
        ValueError, match="exclude must be None when include is 'all'"
    ):
        frame.describe(include="all", exclude=exclude)
def test_describe_with_duplicate_columns(self):
df = DataFrame(
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=["bar", "a", "a"],
dtype="float64",
)
result = df.describe()
ser = df.iloc[:, 0].describe()
expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
tm.assert_frame_equal(result, expected)
def test_ea_with_na(self, any_numeric_ea_dtype):
# GH#48778
df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
result = df.describe()
expected = DataFrame(
{"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
dtype="Float64",
)
tm.assert_frame_equal(result, expected)
def test_describe_exclude_pa_dtype(self):
    """Describe with Arrow dtypes passed as ``include`` / ``exclude`` (GH#52570).

    The int8 column is included and the int32 column excluded, so only
    column "a" is expected in the summary, typed as Arrow float64.
    """
    pa = pytest.importorskip("pyarrow")
    int8_dtype = pd.ArrowDtype(pa.int8())
    int16_dtype = pd.ArrowDtype(pa.int16())
    int32_dtype = pd.ArrowDtype(pa.int32())

    frame = DataFrame(
        {
            "a": Series([1, 2, 3], dtype=int8_dtype),
            "b": Series([1, 2, 3], dtype=int16_dtype),
            "c": Series([1, 2, 3], dtype=int32_dtype),
        }
    )
    described = frame.describe(include=int8_dtype, exclude=int32_dtype)

    want = DataFrame(
        {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype=pd.ArrowDtype(pa.float64()),
    )
    tm.assert_frame_equal(described, want)

Some files were not shown because too many files have changed in this diff Show More