I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,7 @@
"""
Test files dedicated to individual (stand-alone) Series methods.
Ideally these files/tests should correspond 1-to-1 with tests.frame.methods.
These may also present opportunities for sharing/de-duplicating test code.
"""

View File

@ -0,0 +1,41 @@
import pytest
from pandas import Index
import pandas._testing as tm
def test_add_prefix_suffix(string_series):
with_prefix = string_series.add_prefix("foo#")
expected = Index([f"foo#{c}" for c in string_series.index])
tm.assert_index_equal(with_prefix.index, expected)
with_suffix = string_series.add_suffix("#foo")
expected = Index([f"{c}#foo" for c in string_series.index])
tm.assert_index_equal(with_suffix.index, expected)
with_pct_prefix = string_series.add_prefix("%")
expected = Index([f"%{c}" for c in string_series.index])
tm.assert_index_equal(with_pct_prefix.index, expected)
with_pct_suffix = string_series.add_suffix("%")
expected = Index([f"{c}%" for c in string_series.index])
tm.assert_index_equal(with_pct_suffix.index, expected)
def test_add_prefix_suffix_axis(string_series):
# GH 47819
with_prefix = string_series.add_prefix("foo#", axis=0)
expected = Index([f"foo#{c}" for c in string_series.index])
tm.assert_index_equal(with_prefix.index, expected)
with_pct_suffix = string_series.add_suffix("#foo", axis=0)
expected = Index([f"{c}#foo" for c in string_series.index])
tm.assert_index_equal(with_pct_suffix.index, expected)
def test_add_prefix_suffix_invalid_axis(string_series):
    """Verify both affix methods reject ``axis=1`` on a Series."""
    expected_msg = "No axis named 1 for object type Series"

    with pytest.raises(ValueError, match=expected_msg):
        string_series.add_prefix("foo#", axis=1)

    with pytest.raises(ValueError, match=expected_msg):
        string_series.add_suffix("foo#", axis=1)

View File

@ -0,0 +1,249 @@
from datetime import timezone
import numpy as np
import pytest
import pandas as pd
from pandas import (
Series,
date_range,
period_range,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "first_slice,second_slice",
    [
        [[2, None], [None, -5]],
        [[None, 0], [None, -5]],
        [[None, -5], [None, 0]],
        [[None, 0], [None, 0]],
    ],
)
@pytest.mark.parametrize("fill", [None, -1])
def test_align(datetime_series, first_slice, second_slice, join_type, fill):
    """Check ``Series.align`` against a manual reindex (plus fill) of each side.

    Parameters
    ----------
    datetime_series : Series
        Fixture; the assertions below expect it to be named ``"ts"``.
    first_slice, second_slice : list
        Arguments unpacked into ``slice`` to cut the two operands.
    join_type : str
        Fixture supplying the ``join`` / ``how`` value.
    fill : scalar or None
        Passed as ``fill_value``; with ``None`` the expected values are left
        unfilled.
    """
    a = datetime_series[slice(*first_slice)]
    b = datetime_series[slice(*second_slice)]

    aa, ab = a.align(b, join=join_type, fill_value=fill)

    join_index = a.index.join(b.index, how=join_type)
    if fill is not None:
        # Labels introduced by the join (present in the joined index but not
        # in the operand) must hold the fill value.  Taking the difference the
        # other way round, aligned index minus joined index, is always empty
        # because the aligned results are indexed by the joined index, so the
        # checks below would never run.
        introduced_a = join_index.difference(a.index)
        introduced_b = join_index.difference(b.index)
        if len(introduced_a) > 0:
            assert (aa.reindex(introduced_a) == fill).all()
        if len(introduced_b) > 0:
            assert (ab.reindex(introduced_b) == fill).all()

    ea = a.reindex(join_index)
    eb = b.reindex(join_index)

    if fill is not None:
        ea = ea.fillna(fill)
        eb = eb.fillna(fill)

    tm.assert_series_equal(aa, ea)
    tm.assert_series_equal(ab, eb)

    # The name must survive both the alignment and the manual reindex.
    assert aa.name == "ts"
    assert ea.name == "ts"
    assert ab.name == "ts"
    assert eb.name == "ts"
@pytest.mark.parametrize(
    "first_slice,second_slice",
    [
        [[2, None], [None, -5]],
        [[None, 0], [None, -5]],
        [[None, -5], [None, 0]],
        [[None, 0], [None, 0]],
    ],
)
@pytest.mark.parametrize("method", ["pad", "bfill"])
@pytest.mark.parametrize("limit", [None, 1])
def test_align_fill_method(
    datetime_series, first_slice, second_slice, join_type, method, limit
):
    """Check ``align(method=..., limit=...)`` against reindex followed by fillna.

    Both the ``align`` call and the reference ``fillna`` calls are expected to
    emit a ``FutureWarning``.
    """
    left = datetime_series[slice(*first_slice)]
    right = datetime_series[slice(*second_slice)]

    align_msg = (
        "The 'method', 'limit', and 'fill_axis' keywords in Series.align "
        "are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=align_msg):
        aligned_left, aligned_right = left.align(
            right, join=join_type, method=method, limit=limit
        )

    # Build the reference result by hand: reindex to the joined index, then
    # fill with the same method/limit.
    joined = left.index.join(right.index, how=join_type)
    expected_left = left.reindex(joined)
    expected_right = right.reindex(joined)

    fillna_msg = "Series.fillna with 'method' is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=fillna_msg):
        expected_left = expected_left.fillna(method=method, limit=limit)
        expected_right = expected_right.fillna(method=method, limit=limit)

    tm.assert_series_equal(aligned_left, expected_left)
    tm.assert_series_equal(aligned_right, expected_right)
def test_align_nocopy(datetime_series, using_copy_on_write):
b = datetime_series[:5].copy()
# do copy
a = datetime_series.copy()
ra, _ = a.align(b, join="left")
ra[:5] = 5
assert not (a[:5] == 5).any()
# do not copy
a = datetime_series.copy()
ra, _ = a.align(b, join="left", copy=False)
ra[:5] = 5
if using_copy_on_write:
assert not (a[:5] == 5).any()
else:
assert (a[:5] == 5).all()
# do copy
a = datetime_series.copy()
b = datetime_series[:5].copy()
_, rb = a.align(b, join="right")
rb[:3] = 5
assert not (b[:3] == 5).any()
# do not copy
a = datetime_series.copy()
b = datetime_series[:5].copy()
_, rb = a.align(b, join="right", copy=False)
rb[:2] = 5
if using_copy_on_write:
assert not (b[:2] == 5).any()
else:
assert (b[:2] == 5).all()
def test_align_same_index(datetime_series, using_copy_on_write):
a, b = datetime_series.align(datetime_series, copy=False)
if not using_copy_on_write:
assert a.index is datetime_series.index
assert b.index is datetime_series.index
else:
assert a.index.is_(datetime_series.index)
assert b.index.is_(datetime_series.index)
a, b = datetime_series.align(datetime_series, copy=True)
assert a.index is not datetime_series.index
assert b.index is not datetime_series.index
assert a.index.is_(datetime_series.index)
assert b.index.is_(datetime_series.index)
def test_align_multiindex():
    """Align a three-level MultiIndex Series with one indexed by level ``b``.

    ``x.align(y, join="left")`` and ``y.align(x, join="right")`` must give the
    same pair of results with the positions swapped, and likewise for the
    opposite joins.
    """
    # GH 10665
    full_index = pd.MultiIndex.from_product(
        [range(2), range(3), range(2)], names=("a", "b", "c")
    )
    level_b = pd.Index(range(2), name="b")
    multi = Series(np.arange(12, dtype="int64"), index=full_index)
    single = Series(np.arange(2, dtype="int64"), index=level_b)

    # Keep every row of the multi-indexed operand.
    # these must be the same results (but flipped)
    res1l, res1r = multi.align(single, join="left")
    res2l, res2r = single.align(multi, join="right")

    tm.assert_series_equal(multi, res1l)
    tm.assert_series_equal(multi, res2r)

    broadcast = Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=full_index)
    tm.assert_series_equal(broadcast, res1r)
    tm.assert_series_equal(broadcast, res2l)

    # Restrict to the labels of the single-level operand.
    res1l, res1r = multi.align(single, join="right")
    res2l, res2r = single.align(multi, join="left")

    reduced_index = pd.MultiIndex.from_product(
        [range(2), range(2), range(2)], names=("a", "b", "c")
    )
    reduced = Series([0, 1, 2, 3, 6, 7, 8, 9], index=reduced_index)
    tm.assert_series_equal(reduced, res1l)
    tm.assert_series_equal(reduced, res2r)

    broadcast = Series([0, 0, 1, 1] * 2, index=reduced_index)
    tm.assert_series_equal(broadcast, res1r)
    tm.assert_series_equal(broadcast, res2l)
@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None])
def test_align_with_dataframe_method(method):
    """Align a Series with an identically indexed DataFrame, passing ``method``.

    Both operands must come back unchanged, and the call must emit the
    ``FutureWarning`` for the ``method`` keyword.
    """
    # GH31788
    labels = range(3)
    series = Series(range(3), index=labels)
    frame = pd.DataFrame(0.0, index=labels, columns=range(3))

    warning_text = (
        "The 'method', 'limit', and 'fill_axis' keywords in Series.align "
        "are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=warning_text):
        aligned_series, aligned_frame = series.align(frame, method=method)

    tm.assert_series_equal(aligned_series, series)
    tm.assert_frame_equal(aligned_frame, frame)
def test_align_dt64tzindex_mismatched_tzs():
    """Aligning Series whose indexes differ in timezone yields UTC indexes."""
    eastern_index = date_range("2001", periods=5, freq="h", tz="US/Eastern")
    values = np.random.default_rng(2).standard_normal(len(eastern_index))
    eastern = Series(values, index=eastern_index)
    central = eastern.tz_convert("US/Central")

    # different timezones convert to UTC
    for aligned in eastern.align(central):
        assert aligned.index.tz is timezone.utc
def test_align_periodindex(join_type):
rng = period_range("1/1/2000", "1/1/2010", freq="Y")
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
# TODO: assert something?
ts.align(ts[::2], join=join_type)
def test_align_left_fewer_levels():
    """Align when the left MultiIndex has fewer levels than the right.

    Both results are expected on the level order ``a, c, b``: the left
    operand's levels first, then the level only the right operand has.
    """
    # GH#45224
    left_index = pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])
    right_index = pd.MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"])
    left = Series([2], index=left_index)
    right = Series([1], index=right_index)

    result_left, result_right = left.align(right)

    expected_index = pd.MultiIndex.from_tuples([(1, 3, 2)], names=["a", "c", "b"])
    tm.assert_series_equal(result_left, Series([2], index=expected_index))
    tm.assert_series_equal(result_right, Series([1], index=expected_index))
def test_align_left_different_named_levels():
# GH#45224
left = Series(
[2], index=pd.MultiIndex.from_tuples([(1, 4, 3)], names=["a", "d", "c"])
)
right = Series(
[1], index=pd.MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"])
)
result_left, result_right = left.align(right)
expected_left = Series(
[2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"])
)
expected_right = Series(
[1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"])
)
tm.assert_series_equal(result_left, expected_left)
tm.assert_series_equal(result_right, expected_right)

View File

@ -0,0 +1,84 @@
import numpy as np
import pytest
from pandas import (
Series,
Timestamp,
isna,
)
import pandas._testing as tm
class TestSeriesArgsort:
    """Tests for ``Series.argsort`` and for ``np.argsort`` applied to a Series."""

    def test_argsort_axis(self):
        """An axis number that a Series does not have raises ``ValueError``."""
        # GH#54257
        expected_msg = "No axis named 2 for object type Series"
        with pytest.raises(ValueError, match=expected_msg):
            Series(range(3)).argsort(axis=2)

    def test_argsort_numpy(self, datetime_series):
        """``np.argsort`` on a Series agrees with ``np.argsort`` on its values."""
        plain = np.argsort(datetime_series).values
        tm.assert_numpy_array_equal(plain, np.argsort(np.array(datetime_series)))

        # with missing values
        holed = datetime_series.copy()
        holed[::2] = np.nan

        na_msg = "The behavior of Series.argsort in the presence of NA values"
        with tm.assert_produces_warning(
            FutureWarning, match=na_msg, check_stacklevel=False
        ):
            result = np.argsort(holed)[1::2]
        expected = np.argsort(np.array(holed.dropna()))

        tm.assert_numpy_array_equal(result.values, expected)

    def test_argsort(self, datetime_series):
        """The result of ``argsort`` has an integer dtype."""
        result_type = datetime_series.argsort().dtype.type
        assert issubclass(result_type, np.integer)

    def test_argsort_dt64(self, unit):
        """``argsort`` on datetime64 data, without and with a trailing NaT."""
        # GH#2967 (introduced bug in 0.11-dev I think)
        stamps = [Timestamp(f"201301{i:02d}") for i in range(1, 6)]
        ser = Series(stamps, dtype=f"M8[{unit}]")
        assert ser.dtype == f"datetime64[{unit}]"

        shifted = ser.shift(-1)
        assert shifted.dtype == f"datetime64[{unit}]"
        assert isna(shifted[4])

        tm.assert_series_equal(ser.argsort(), Series(range(5), dtype=np.intp))

        # The missing last entry is reported as -1, with a deprecation warning.
        na_msg = "The behavior of Series.argsort in the presence of NA values"
        with tm.assert_produces_warning(FutureWarning, match=na_msg):
            result = shifted.argsort()
        expected = Series(list(range(4)) + [-1], dtype=np.intp)
        tm.assert_series_equal(result, expected)

    def test_argsort_stable(self):
        """``argsort`` matches NumPy for mergesort and for the default kind."""
        ser = Series(np.random.default_rng(2).integers(0, 100, size=10000))
        merge_result = ser.argsort(kind="mergesort")
        default_result = ser.argsort()

        merge_expected = np.argsort(ser.values, kind="mergesort")
        quick_expected = np.argsort(ser.values, kind="quicksort")

        tm.assert_series_equal(merge_result.astype(np.intp), Series(merge_expected))
        tm.assert_series_equal(default_result.astype(np.intp), Series(quick_expected))

        # Passing Series to the ndarray comparison helper must fail on type.
        type_msg = (
            r"ndarray Expected type <class 'numpy\.ndarray'>, "
            r"found <class 'pandas\.core\.series\.Series'> instead"
        )
        with pytest.raises(AssertionError, match=type_msg):
            tm.assert_numpy_array_equal(default_result, merge_result)

    def test_argsort_preserve_name(self, datetime_series):
        """The Series name carries over to the ``argsort`` result."""
        assert datetime_series.argsort().name == datetime_series.name

View File

@ -0,0 +1,205 @@
import numpy as np
import pytest
from pandas._libs.tslibs import IncompatibleFrequency
from pandas import (
DatetimeIndex,
PeriodIndex,
Series,
Timestamp,
date_range,
isna,
notna,
offsets,
period_range,
)
import pandas._testing as tm
class TestSeriesAsof:
    """Tests for ``Series.asof``."""

    def test_asof_nanosecond_index_access(self):
        """``asof`` and label lookup agree on a nanosecond-spaced index."""
        base = Timestamp("20130101").as_unit("ns")._value
        dti = DatetimeIndex([base + 50 + step for step in range(100)])
        ser = Series(np.random.default_rng(2).standard_normal(100), index=dti)

        first_value = ser.asof(ser.index[0])

        # GH#46903 previously incorrectly was "day"
        assert dti.resolution == "nanosecond"

        # this used to not work bc parsing was done by dateutil that didn't
        # handle nanoseconds
        assert first_value == ser["2013-01-01 00:00:00.000000050"]

        expected_ts = np.datetime64("2013-01-01 00:00:00.000000050", "ns")
        assert first_value == ser[Timestamp(expected_ts)]

    def test_basic(self):
        """``asof`` with array-like input skips over a run of NaN values."""
        n_points = 50
        index = date_range("1/1/1990", periods=n_points, freq="53s")
        ts = Series(np.random.default_rng(2).standard_normal(n_points), index=index)
        ts.iloc[15:30] = np.nan

        dates = date_range("1/1/1990", periods=n_points * 3, freq="25s")

        # array or list or dates
        for lookup in (dates, list(dates)):
            result = ts.asof(lookup)
            assert notna(result).all()

        # Last valid label before the NaN run and first valid label after it.
        lb = ts.index[14]
        ub = ts.index[30]

        inside_gap = (result.index >= lb) & (result.index < ub)
        assert (result[inside_gap] == ts[lb]).all()

        first_after_gap = result.index[result.index >= ub][0]
        assert ts[ub] == result[first_after_gap]

    def test_scalar(self):
        """``asof`` with a scalar key: Timestamp, string, exact hit, too early."""
        n_points = 30
        index = date_range("1/1/1990", periods=n_points, freq="53s")
        # Explicit cast to float avoid implicit cast when setting nan
        ts = Series(np.arange(n_points), index=index, dtype="float")
        ts.iloc[5:10] = np.nan
        ts.iloc[15:20] = np.nan

        # Keys inside a NaN run resolve to the last valid value before it.
        assert ts.asof(ts.index[7]) == ts.iloc[4]
        assert ts.asof(ts.index[19]) == ts.iloc[14]

        # accepts strings
        assert ts.asof(str(ts.index[7])) == ts.iloc[4]

        # in there
        assert ts.asof(ts.index[3]) == ts.iloc[3]

        # no as of value
        too_early = ts.index[0] - offsets.BDay()
        assert np.isnan(ts.asof(too_early))

    def test_with_nan(self):
        """``asof`` over a resampled Series as more values are set to NaN."""
        # basic asof test
        source_index = date_range("1/1/2000", "1/2/2000", freq="4h")
        source = Series(np.arange(len(source_index)), index=source_index)
        resampled = source.resample("2h").mean()
        expected_index = date_range("1/1/2000", "1/2/2000", freq="2h")

        result = resampled.asof(resampled.index)
        expected = Series(
            [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.0], index=expected_index
        )
        tm.assert_series_equal(result, expected)

        resampled.iloc[3:5] = np.nan
        result = resampled.asof(resampled.index)
        expected = Series(
            [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.0], index=expected_index
        )
        tm.assert_series_equal(result, expected)

        resampled.iloc[-3:] = np.nan
        result = resampled.asof(resampled.index)
        expected = Series(
            [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.0], index=expected_index
        )
        tm.assert_series_equal(result, expected)

    def test_periodindex(self):
        """``asof`` on a PeriodIndex Series: array, scalar and mismatched freq."""
        n_points = 50
        rng = period_range("1/1/1990", periods=n_points, freq="h")
        ts = Series(np.random.default_rng(2).standard_normal(n_points), index=rng)
        ts.iloc[15:30] = np.nan

        dates = date_range("1/1/1990", periods=n_points * 3, freq="37min")

        # array or list or dates
        for lookup in (dates, list(dates)):
            result = ts.asof(lookup)
            assert notna(result).all()

        lb = ts.index[14]
        ub = ts.index[30]

        as_periods = PeriodIndex(result.index.values, freq="h")
        inside_gap = (as_periods >= lb) & (as_periods < ub)
        assert (result[inside_gap] == ts[lb]).all()

        ts.iloc[5:10] = np.nan
        ts.iloc[15:20] = np.nan

        assert ts.asof(ts.index[7]) == ts.iloc[4]
        assert ts.asof(ts.index[19]) == ts.iloc[14]

        # accepts strings
        assert ts.asof(str(ts.index[7])) == ts.iloc[4]

        # in there
        assert ts.asof(ts.index[3]) == ts.iloc[3]

        # no as of value
        too_early = ts.index[0].to_timestamp() - offsets.BDay()
        assert isna(ts.asof(too_early))

        # Mismatched freq
        with pytest.raises(IncompatibleFrequency, match="Input has different freq"):
            ts.asof(rng.asfreq("D"))

    def test_errors(self):
        """``asof`` rejects an unsorted index and a ``subset`` argument."""
        unsorted_index = [
            Timestamp("20130101"),
            Timestamp("20130103"),
            Timestamp("20130102"),
        ]
        unsorted = Series([1, 2, 3], index=unsorted_index)

        # non-monotonic
        assert not unsorted.index.is_monotonic_increasing
        with pytest.raises(ValueError, match="requires a sorted index"):
            unsorted.asof(unsorted.index[0])

        # subset with Series
        n_points = 10
        index = date_range("1/1/1990", periods=n_points, freq="53s")
        ser = Series(np.random.default_rng(2).standard_normal(n_points), index=index)
        with pytest.raises(ValueError, match="not valid for Series"):
            ser.asof(ser.index[0], subset="foo")

    def test_all_nans(self):
        """``asof`` on an all-NaN Series returns NaN and keeps the name."""
        # GH 15713
        # series is all nans

        # testing non-default indexes
        n_points = 50
        index = date_range("1/1/1990", periods=n_points, freq="53s")
        dates = date_range("1/1/1990", periods=n_points * 3, freq="25s")

        result = Series(np.nan, index=index).asof(dates)
        tm.assert_series_equal(result, Series(np.nan, index=dates))

        # testing scalar input
        first_date = dates[0]
        assert isna(Series(np.nan, index=index).asof(first_date))

        # test name is propagated
        named = Series(np.nan, index=[1, 2, 3, 4], name="test")
        result = named.asof([4, 5])
        tm.assert_series_equal(result, Series(np.nan, index=[4, 5], name="test"))

View File

@ -0,0 +1,683 @@
from datetime import (
datetime,
timedelta,
)
from importlib import reload
import string
import sys
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
import pandas.util._test_decorators as td
from pandas import (
NA,
Categorical,
CategoricalDtype,
DatetimeTZDtype,
Index,
Interval,
NaT,
Series,
Timedelta,
Timestamp,
cut,
date_range,
to_datetime,
)
import pandas._testing as tm
def rand_str(nchars: int) -> str:
    """
    Build a ``str`` of ``nchars`` ASCII letters and digits.

    The generator is re-created with the same fixed seed on every call, so
    repeated calls with the same ``nchars`` return the same string.
    """
    alphabet = string.ascii_letters + string.digits
    pool = np.array(list(alphabet), dtype=(np.str_, 1))
    rng = np.random.default_rng(2)
    picked = rng.choice(pool, nchars)
    return "".join(picked)
class TestAstypeAPI:
    """Tests for argument handling of ``Series.astype``."""

    def test_astype_unitless_dt64_raises(self):
        """Casting to a unit-less ``datetime64`` raises for Series and frame."""
        # GH#47844
        ser = Series(["1970-01-01", "1970-01-01", "1970-01-01"], dtype="datetime64[ns]")
        df = ser.to_frame()

        msg = "Casting to unit-less dtype 'datetime64' is not supported"
        for target in (np.datetime64, "datetime64"):
            for obj in (ser, df):
                with pytest.raises(TypeError, match=msg):
                    obj.astype(target)

    def test_arg_for_errors_in_astype(self):
        """A non-string ``errors`` value is rejected; ``"raise"`` is accepted."""
        # see GH#14878
        ser = Series([1, 2, 3])

        expected_msg = (
            r"Expected value of kwarg 'errors' to be one of \['raise', "
            r"'ignore'\]\. Supplied value is 'False'"
        )
        with pytest.raises(ValueError, match=expected_msg):
            ser.astype(np.float64, errors=False)

        # A valid value must go through without raising.
        ser.astype(np.int8, errors="raise")

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """A dtype mapping works only when its sole key is the Series name."""
        # see GH#7271
        ser = Series(range(0, 10, 2), name="abc")

        as_str = ser.astype(dtype_class({"abc": str}))
        tm.assert_series_equal(
            as_str, Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
        )

        as_float = ser.astype(dtype_class({"abc": "float64"}))
        tm.assert_series_equal(
            as_float, Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc")
        )

        key_msg = (
            "Only the Series name can be used for the key in Series dtype "
            r"mappings\."
        )

        # An extra key, or a key that is not the name, is an error.
        extra_key = dtype_class({"abc": str, "def": str})
        with pytest.raises(KeyError, match=key_msg):
            ser.astype(extra_key)

        wrong_key = dtype_class({0: str})
        with pytest.raises(KeyError, match=key_msg):
            ser.astype(wrong_key)

        # GH#16717
        # if dtypes provided is empty, it should error
        if dtype_class is not Series:
            empty = dtype_class({})
        else:
            empty = dtype_class({}, dtype=object)

        with pytest.raises(KeyError, match=key_msg):
            ser.astype(empty)
class TestAstype:
    """Tests for ``Series.astype`` conversions between concrete dtypes."""

    @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
    def test_astype_object_to_dt64_non_nano(self, tz):
        """Object values cast to microsecond datetime64, naive or tz-aware."""
        # GH#55756, GH#54620
        ts = Timestamp("2999-01-01")
        dtype = "M8[us]"
        if tz is not None:
            dtype = f"M8[us, {tz}]"
        vals = [ts, "2999-01-02 03:04:05.678910", 2500]
        ser = Series(vals, dtype=object)
        result = ser.astype(dtype)

        # The 2500 is interpreted as microseconds, consistent with what
        # we would get if we created DatetimeIndexes from vals[:2] and vals[2:]
        # and concated the results.
        pointwise = [
            vals[0].tz_localize(tz),
            Timestamp(vals[1], tz=tz),
            to_datetime(vals[2], unit="us", utc=True).tz_convert(tz),
        ]
        exp_vals = [x.as_unit("us").asm8 for x in pointwise]
        exp_arr = np.array(exp_vals, dtype="M8[us]")
        expected = Series(exp_arr, dtype="M8[us]")
        if tz is not None:
            expected = expected.dt.tz_localize("UTC").dt.tz_convert(tz)
        tm.assert_series_equal(result, expected)

    def test_astype_mixed_object_to_dt64tz(self):
        """Timestamps in different zones cast to a single target zone."""
        # pre-2.0 this raised ValueError bc of tz mismatch
        # xref GH#32581
        ts = Timestamp("2016-01-04 05:06:07", tz="US/Pacific")
        ts2 = ts.tz_convert("Asia/Tokyo")

        ser = Series([ts, ts2], dtype=object)
        res = ser.astype("datetime64[ns, Europe/Brussels]")
        expected = Series(
            [ts.tz_convert("Europe/Brussels"), ts2.tz_convert("Europe/Brussels")],
            dtype="datetime64[ns, Europe/Brussels]",
        )
        tm.assert_series_equal(res, expected)

    @pytest.mark.parametrize("dtype", np.typecodes["All"])
    def test_astype_empty_constructor_equality(self, dtype):
        """An empty Series built with a dtype equals an empty one cast to it."""
        # see GH#15524

        if dtype not in (
            "S",
            "V",  # poor support (if any) currently
            "M",
            "m",  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize(
        "series",
        [
            Series([string.digits * 10, rand_str(63), rand_str(64), rand_str(1000)]),
            Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]),
        ],
    )
    def test_astype_str_map(self, dtype, series, using_infer_string):
        """``astype(str)`` gives the same values as ``map(str)``."""
        # see GH#4405
        result = series.astype(dtype)
        expected = series.map(str)
        if using_infer_string:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    def test_astype_float_to_period(self):
        """A float NaN casts to ``NaT`` under a period dtype."""
        result = Series([np.nan]).astype("period[D]")
        expected = Series([NaT], dtype="period[D]")
        tm.assert_series_equal(result, expected)

    def test_astype_no_pandas_dtype(self):
        """Casting to the dtype of ``ser.array`` leaves the Series unchanged."""
        # https://github.com/pandas-dev/pandas/pull/24866
        ser = Series([1, 2], dtype="int64")
        # Don't have NumpyEADtype in the public API, so we use `.array.dtype`,
        # which is a NumpyEADtype.
        result = ser.astype(ser.array.dtype)
        tm.assert_series_equal(result, ser)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_astype_generic_timestamp_no_frequency(self, dtype, request):
        """Casting to a unit-less datetime64/timedelta64 raises ``ValueError``."""
        # see GH#15524, GH#15987
        data = [1]
        ser = Series(data)

        if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
            mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
            request.applymarker(mark)

        msg = (
            rf"The '{dtype.__name__}' dtype has no unit\. "
            rf"Please pass in '{dtype.__name__}\[ns\]' instead."
        )
        with pytest.raises(ValueError, match=msg):
            ser.astype(dtype)

    def test_astype_dt64_to_str(self):
        """Midnight naive datetimes are rendered as dates only."""
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti = date_range("2012-01-01", periods=3)
        result = Series(dti).astype(str)
        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
        tm.assert_series_equal(result, expected)

    def test_astype_dt64tz_to_str(self):
        """Tz-aware datetimes are rendered with time and UTC offset."""
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti_tz = date_range("2012-01-01", periods=3, tz="US/Eastern")
        result = Series(dti_tz).astype(str)
        expected = Series(
            [
                "2012-01-01 00:00:00-05:00",
                "2012-01-02 00:00:00-05:00",
                "2012-01-03 00:00:00-05:00",
            ],
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

    def test_astype_datetime(self, unit):
        """datetime64 Series, with or without missing values, cast to object."""
        ser = Series(iNaT, dtype=f"M8[{unit}]", index=range(5))

        ser = ser.astype("O")
        assert ser.dtype == np.object_

        ser = Series([datetime(2001, 1, 2, 0, 0)])

        ser = ser.astype("O")
        assert ser.dtype == np.object_

        ser = Series(
            [datetime(2001, 1, 2, 0, 0) for i in range(3)], dtype=f"M8[{unit}]"
        )

        # Setting NaN must keep the datetime64 dtype.
        ser[1] = np.nan
        assert ser.dtype == f"M8[{unit}]"

        ser = ser.astype("O")
        assert ser.dtype == np.object_

    def test_astype_datetime64tz(self):
        """Tz-aware Series cast to object, from naive, and to another zone."""
        ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))

        # astype
        result = ser.astype(object)
        expected = Series(ser.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        result = Series(ser.values).dt.tz_localize("UTC").dt.tz_convert(ser.dt.tz)
        tm.assert_series_equal(result, ser)

        # astype - object, preserves on construction
        result = Series(ser.astype(object))
        expected = ser.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        msg = "Cannot use .astype to convert from timezone-naive"
        with pytest.raises(TypeError, match=msg):
            # dt64->dt64tz astype deprecated
            Series(ser.values).astype("datetime64[ns, US/Eastern]")

        with pytest.raises(TypeError, match=msg):
            # dt64->dt64tz astype deprecated
            Series(ser.values).astype(ser.dtype)

        result = ser.astype("datetime64[ns, CET]")
        expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET"))
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast_dt64(self):
        """Single naive and tz-aware Timestamps cast to their string form."""
        # see GH#9757
        ts = Series([Timestamp("2010-01-04 00:00:00")])
        res = ts.astype(str)

        expected = Series(["2010-01-04"], dtype=object)
        tm.assert_series_equal(res, expected)

        ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
        res = ts.astype(str)

        expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
        tm.assert_series_equal(res, expected)

    def test_astype_str_cast_td64(self):
        """A one-day Timedelta casts to the string ``"1 days"``."""
        # see GH#9757

        td = Series([Timedelta(1, unit="d")])
        ser = td.astype(str)

        expected = Series(["1 days"], dtype=object)
        tm.assert_series_equal(ser, expected)

    def test_dt64_series_astype_object(self):
        """datetime64 -> object yields ``datetime`` instances."""
        dt64ser = Series(date_range("20130101", periods=3))
        result = dt64ser.astype(object)
        assert isinstance(result.iloc[0], datetime)
        assert result.dtype == np.object_

    def test_td64_series_astype_object(self):
        """timedelta64 -> object yields ``timedelta`` instances."""
        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]")
        result = tdser.astype(object)
        assert isinstance(result.iloc[0], timedelta)
        assert result.dtype == np.object_

    @pytest.mark.parametrize(
        "data, dtype",
        [
            (["x", "y", "z"], "string[python]"),
            pytest.param(
                ["x", "y", "z"],
                "string[pyarrow]",
                marks=td.skip_if_no("pyarrow"),
            ),
            (["x", "y", "z"], "category"),
            (3 * [Timestamp("2020-01-01", tz="UTC")], None),
            (3 * [Interval(0, 1)], None),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
        """``errors="ignore"`` returns the input; ``"raise"`` propagates."""
        # https://github.com/pandas-dev/pandas/issues/35471
        ser = Series(data, dtype=dtype)
        if errors == "ignore":
            expected = ser
            result = ser.astype(float, errors="ignore")
            tm.assert_series_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                ser.astype(float, errors=errors)

    @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
    def test_astype_from_float_to_str(self, dtype):
        """``0.1`` casts to ``"0.1"`` for each float width."""
        # https://github.com/pandas-dev/pandas/issues/36451
        ser = Series([0.1], dtype=dtype)
        result = ser.astype(str)
        expected = Series(["0.1"], dtype=object)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "value, string_value",
        [
            (None, "None"),
            (np.nan, "nan"),
            (NA, "<NA>"),
        ],
    )
    def test_astype_to_str_preserves_na(self, value, string_value):
        """Each missing-value marker casts to its own string form."""
        # https://github.com/pandas-dev/pandas/issues/36904
        ser = Series(["a", "b", value], dtype=object)
        result = ser.astype(str)
        expected = Series(["a", "b", string_value], dtype=object)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
    def test_astype(self, dtype):
        """The result carries the requested dtype and the original name."""
        ser = Series(np.random.default_rng(2).standard_normal(5), name="foo")
        as_typed = ser.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == ser.name

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        """NaN and inf cannot be cast to an integer dtype."""
        # gh-14265: check NaN and inf raise error when converting to int
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        ser = Series([value])

        with pytest.raises(ValueError, match=msg):
            ser.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        """Non-numeric strings cannot be cast to an integer dtype."""
        arr = Series(["car", "house", "tree", "1"])
        msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=msg):
            arr.astype(dtype)

    def test_astype_float_to_uint_negatives_raise(
        self, float_numpy_dtype, any_unsigned_int_numpy_dtype
    ):
        """Negative floats raise on a cast to unsigned int in every container."""
        # GH#45151 We don't cast negative numbers to nonsense values
        # TODO: same for EA float/uint dtypes, signed integers?
        arr = np.arange(5).astype(float_numpy_dtype) - 3  # includes negatives
        ser = Series(arr)

        msg = "Cannot losslessly cast from .* to .*"
        with pytest.raises(ValueError, match=msg):
            ser.astype(any_unsigned_int_numpy_dtype)

        with pytest.raises(ValueError, match=msg):
            ser.to_frame().astype(any_unsigned_int_numpy_dtype)

        with pytest.raises(ValueError, match=msg):
            # We currently catch and re-raise in Index.astype
            Index(ser).astype(any_unsigned_int_numpy_dtype)

        with pytest.raises(ValueError, match=msg):
            ser.array.astype(any_unsigned_int_numpy_dtype)

    def test_astype_cast_object_int(self):
        """Numeric strings held as object cast to integers."""
        arr = Series(["1", "2", "3", "4"], dtype=object)
        result = arr.astype(int)

        tm.assert_series_equal(result, Series(np.arange(1, 5)))

    def test_astype_unicode(self, using_infer_string):
        """``astype(np.str_)`` matches ``map(str)`` and decodes bytes."""
        # see GH#7758
        digits = string.digits
        test_series = [
            Series([digits * 10, rand_str(63), rand_str(64), rand_str(1000)]),
            Series(["データーサイエンス、お前はもう死んでいる"]),
        ]

        if sys.getdefaultencoding() == "utf-8":
            # GH#45326 as of 2.0 Series.astype matches Index.astype by handling
            # bytes with obj.decode() instead of str(obj)
            item = "野菜食べないとやばい"
            ser = Series([item.encode()])
            result = ser.astype(np.str_)
            expected = Series([item], dtype=object)
            tm.assert_series_equal(result, expected)

        for ser in test_series:
            res = ser.astype(np.str_)
            expec = ser.map(str)
            if using_infer_string:
                expec = expec.astype(object)
            tm.assert_series_equal(res, expec)

        # The test never changes the default encoding, so there is nothing to
        # restore afterwards.

    def test_astype_bytes(self):
        """Three-character strings cast to ``bytes`` give dtype ``S3``."""
        # GH#39474
        result = Series(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes == np.dtype("S3")

    def test_astype_nan_to_bool(self):
        """NaN held as object casts to ``True``."""
        # GH#43018
        ser = Series(np.nan, dtype="object")
        result = ser.astype("bool")
        expected = Series(True, dtype="bool")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        tm.ALL_INT_EA_DTYPES + tm.FLOAT_EA_DTYPES,
    )
    def test_astype_ea_to_datetimetzdtype(self, dtype):
        """Integer/float extension dtypes cast to a tz-aware datetime dtype."""
        # GH37553
        ser = Series([4, 0, 9], dtype=dtype)
        result = ser.astype(DatetimeTZDtype(tz="US/Pacific"))

        # The expected stamps sit 4, 0 and 9 nanoseconds past the epoch.
        expected = Series(
            {
                0: Timestamp("1969-12-31 16:00:00.000000004-08:00", tz="US/Pacific"),
                1: Timestamp("1969-12-31 16:00:00.000000000-08:00", tz="US/Pacific"),
                2: Timestamp("1969-12-31 16:00:00.000000009-08:00", tz="US/Pacific"),
            }
        )

        tm.assert_series_equal(result, expected)

    def test_astype_retain_attrs(self, any_numpy_dtype):
        """``attrs`` survive an ``astype`` call."""
        # GH#44414
        ser = Series([0, 1, 2, 3])
        ser.attrs["Location"] = "Michigan"

        result = ser.astype(any_numpy_dtype).attrs
        expected = ser.attrs

        tm.assert_dict_equal(expected, result)
class TestAstypeString:
    """Round-trip tests between extension dtypes and the nullable string dtype."""

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([True, NA], "boolean"),
            (["A", NA], "category"),
            (["2020-10-10", "2020-10-10"], "datetime64[ns]"),
            (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"),
            (
                ["2012-01-01 00:00:00-05:00", NaT],
                "datetime64[ns, US/Eastern]",
            ),
            ([1, None], "UInt16"),
            (["1/1/2021", "2/1/2021"], "period[M]"),
            (["1/1/2021", "2/1/2021", NaT], "period[M]"),
            (["1 Day", "59 Days", NaT], "timedelta64[ns]"),
            # currently no way to parse IntervalArray from a list of strings
        ],
    )
    def test_astype_string_to_extension_dtype_roundtrip(
        self, data, dtype, request, nullable_string_dtype
    ):
        """Casting to the string dtype and back reproduces the original Series.

        The ``"boolean"`` case is marked as an expected failure.
        """
        if dtype == "boolean":
            request.applymarker(
                pytest.mark.xfail(
                    reason="TODO StringArray.astype() with missing values #GH40566"
                )
            )
        # GH-40351
        original = Series(data, dtype=dtype)

        # Note: just passing .astype(dtype) fails for dtype="category"
        # with bc ser.dtype.categories will be object dtype whereas
        # result.dtype.categories will have string dtype
        as_string = original.astype(nullable_string_dtype)
        round_tripped = as_string.astype(original.dtype)
        tm.assert_series_equal(round_tripped, original)
class TestAstypeCategorical:
def test_astype_categorical_to_other(self):
    """Cast categorical Series to category, float, str, int and object.

    Covers: a no-op cast to category, the error raised for a float64 target,
    casts to ``str`` and ``int``, value equality after a cast to object, and
    the object -> category round trip.
    """
    cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
    ser = Series(np.random.default_rng(2).integers(0, 10000, 100)).sort_values()
    ser = cut(ser, range(0, 10500, 500), right=False, labels=cat)

    expected = ser
    tm.assert_series_equal(ser.astype("category"), expected)
    tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)

    # The alternation is grouped on purpose: a bare ``object|string`` splits
    # the whole pattern in two, so either half alone would be accepted.
    msg = r"Cannot cast (?:object|string) dtype to float64"
    with pytest.raises(ValueError, match=msg):
        ser.astype("float64")

    cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
    exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
    tm.assert_series_equal(cat.astype("str"), exp)

    s2 = Series(Categorical(["1", "2", "3", "4"]))
    exp2 = Series([1, 2, 3, 4]).astype("int")
    tm.assert_series_equal(s2.astype("int"), exp2)

    # object don't sort correctly, so just compare that we have the same
    # values
    def cmp(a, b):
        tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b)))

    expected = Series(np.array(ser.values), name="value_group")
    cmp(ser.astype("object"), expected)
    cmp(ser.astype(np.object_), expected)

    # array conversion
    tm.assert_almost_equal(np.array(ser), np.array(ser.values))

    tm.assert_series_equal(ser.astype("category"), ser)
    tm.assert_series_equal(ser.astype(CategoricalDtype()), ser)

    # Reference for the round trip: categories sorted, unused ones dropped.
    roundtrip_expected = ser.cat.set_categories(
        ser.cat.categories.sort_values()
    ).cat.remove_unused_categories()
    result = ser.astype("object").astype("category")
    tm.assert_series_equal(result, roundtrip_expected)
    result = ser.astype("object").astype(CategoricalDtype())
    tm.assert_series_equal(result, roundtrip_expected)
def test_astype_categorical_invalid_conversions(self):
# invalid conversion (these are NOT a dtype)
cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
ser = Series(np.random.default_rng(2).integers(0, 10000, 100)).sort_values()
ser = cut(ser, range(0, 10500, 500), right=False, labels=cat)
msg = (
"dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
"not understood"
)
with pytest.raises(TypeError, match=msg):
ser.astype(Categorical)
with pytest.raises(TypeError, match=msg):
ser.astype("object").astype(Categorical)
def test_astype_categoricaldtype(self):
ser = Series(["a", "b", "a"])
result = ser.astype(CategoricalDtype(["a", "b"], ordered=True))
expected = Series(Categorical(["a", "b", "a"], ordered=True))
tm.assert_series_equal(result, expected)
result = ser.astype(CategoricalDtype(["a", "b"], ordered=False))
expected = Series(Categorical(["a", "b", "a"], ordered=False))
tm.assert_series_equal(result, expected)
result = ser.astype(CategoricalDtype(["a", "b", "c"], ordered=False))
expected = Series(
Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False)
)
tm.assert_series_equal(result, expected)
tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))
@pytest.mark.parametrize("name", [None, "foo"])
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("series_ordered", [True, False])
def test_astype_categorical_to_categorical(
self, name, dtype_ordered, series_ordered
):
# GH#10696, GH#18593
s_data = list("abcaacbab")
s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered)
ser = Series(s_data, dtype=s_dtype, name=name)
# unspecified categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = ser.astype(dtype)
exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
expected = Series(s_data, name=name, dtype=exp_dtype)
tm.assert_series_equal(result, expected)
# different categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = ser.astype(dtype)
expected = Series(s_data, name=name, dtype=dtype)
tm.assert_series_equal(result, expected)
if dtype_ordered is False:
# not specifying ordered, so only test once
expected = ser
result = ser.astype("category")
tm.assert_series_equal(result, expected)
def test_astype_bool_missing_to_categorical(self):
# GH-19182
ser = Series([True, False, np.nan])
assert ser.dtypes == np.object_
result = ser.astype(CategoricalDtype(categories=[True, False]))
expected = Series(Categorical([True, False, np.nan], categories=[True, False]))
tm.assert_series_equal(result, expected)
def test_astype_categories_raises(self):
# deprecated GH#17636, removed in GH#27141
ser = Series(["a", "b", "a"])
with pytest.raises(TypeError, match="got an unexpected"):
ser.astype("category", categories=["a", "b"], ordered=True)
@pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]])
def test_astype_from_categorical(self, items):
ser = Series(items)
exp = Series(Categorical(items))
res = ser.astype("category")
tm.assert_series_equal(res, exp)
def test_astype_from_categorical_with_keywords(self):
# with keywords
lst = ["a", "b", "c", "a"]
ser = Series(lst)
exp = Series(Categorical(lst, ordered=True))
res = ser.astype(CategoricalDtype(None, ordered=True))
tm.assert_series_equal(res, exp)
exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True))
res = ser.astype(CategoricalDtype(list("abcdef"), ordered=True))
tm.assert_series_equal(res, exp)
def test_astype_timedelta64_with_np_nan(self):
# GH45798
result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]")
expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]")
tm.assert_series_equal(result, expected)
@td.skip_if_no("pyarrow")
def test_astype_int_na_string(self):
# GH#57418
ser = Series([12, NA], dtype="Int64[pyarrow]")
result = ser.astype("string[pyarrow]")
expected = Series(["12", NA], dtype="string[pyarrow]")
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,30 @@
import numpy as np
class TestAutoCorr:
def test_autocorr(self, datetime_series):
# Just run the function
corr1 = datetime_series.autocorr()
# Now run it with the lag parameter
corr2 = datetime_series.autocorr(lag=1)
# corr() with lag needs Series of at least length 2
if len(datetime_series) <= 2:
assert np.isnan(corr1)
assert np.isnan(corr2)
else:
assert corr1 == corr2
# Choose a random lag between 1 and length of Series - 2
# and compare the result with the Series corr() function
n = 1 + np.random.default_rng(2).integers(max(1, len(datetime_series) - 2))
corr1 = datetime_series.corr(datetime_series.shift(n))
corr2 = datetime_series.autocorr(lag=n)
# corr() with lag needs Series of at least length 2
if len(datetime_series) <= 2:
assert np.isnan(corr1)
assert np.isnan(corr2)
else:
assert corr1 == corr2

View File

@ -0,0 +1,75 @@
import numpy as np
import pytest
from pandas import (
Series,
bdate_range,
date_range,
period_range,
)
import pandas._testing as tm
class TestBetween:
    """Checks for ``Series.between``."""

    def test_between(self):
        """Default ``between`` equals the closed interval written with >= and <=."""
        series = Series(date_range("1/1/2000", periods=10))
        left, right = series[[2, 7]]

        result = series.between(left, right)
        expected = (series >= left) & (series <= right)
        tm.assert_series_equal(result, expected)

    def test_between_datetime_object_dtype(self):
        """``between`` on object-dtype datetimes with every other value missing."""
        ser = Series(bdate_range("1/1/2000", periods=20), dtype=object)
        ser[::2] = np.nan

        result = ser[ser.between(ser[3], ser[17])]
        expected = ser[3:18].dropna()
        tm.assert_series_equal(result, expected)

        result = ser[ser.between(ser[3], ser[17], inclusive="neither")]
        expected = ser[5:16].dropna()
        tm.assert_series_equal(result, expected)

    def test_between_period_values(self):
        """``between`` on a Series of daily periods."""
        ser = Series(period_range("2000-01-01", periods=10, freq="D"))
        left, right = ser[[2, 7]]
        result = ser.between(left, right)
        expected = (ser >= left) & (ser <= right)
        tm.assert_series_equal(result, expected)

    def test_between_inclusive_string(self):
        """Each ``inclusive`` string maps to the matching pair of comparisons."""
        # GH 40628
        series = Series(date_range("1/1/2000", periods=10))
        left, right = series[[2, 7]]

        result = series.between(left, right, inclusive="both")
        expected = (series >= left) & (series <= right)
        tm.assert_series_equal(result, expected)

        result = series.between(left, right, inclusive="left")
        expected = (series >= left) & (series < right)
        tm.assert_series_equal(result, expected)

        result = series.between(left, right, inclusive="right")
        expected = (series > left) & (series <= right)
        tm.assert_series_equal(result, expected)

        result = series.between(left, right, inclusive="neither")
        expected = (series > left) & (series < right)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("inclusive", ["yes", True, False])
    def test_between_error_args(self, inclusive):
        """An ``inclusive`` value outside the four accepted strings raises ValueError."""
        # GH 40628
        series = Series(date_range("1/1/2000", periods=10))
        left, right = series[[2, 7]]

        value_error_msg = (
            "Inclusive has to be either string of 'both',"
            "'left', 'right', or 'neither'."
        )

        # Only the call expected to raise lives inside the context manager,
        # so an unrelated failure during setup cannot satisfy the assertion.
        with pytest.raises(ValueError, match=value_error_msg):
            series.between(left, right, inclusive=inclusive)

View File

@ -0,0 +1,148 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
array as pd_array,
date_range,
)
import pandas._testing as tm
@pytest.fixture
def df():
    """
    Two-column integer DataFrame (``a`` = 1..3, ``b`` = 4..6) used as test input.
    """
    columns = {"a": [1, 2, 3], "b": [4, 5, 6]}
    return DataFrame(columns)
def test_case_when_caselist_is_not_a_list(df):
    """
    Raise TypeError if caselist is not a list.
    """
    # The message is used as a regex; ".+" stands for the reported type.
    msg = "The caselist argument should be a list; "
    msg += "instead got.+"
    with pytest.raises(TypeError, match=msg):  # GH39154
        df["a"].case_when(caselist=())
def test_case_when_no_caselist(df):
    """
    Raise ValueError if the caselist is empty.
    """
    expected_message = (
        "provide at least one boolean condition, "
        "with a corresponding replacement."
    )
    column = df["a"]
    with pytest.raises(ValueError, match=expected_message):  # GH39154
        column.case_when([])
def test_case_when_odd_caselist(df):
    """
    Raise ValueError if a caselist entry is not a (condition, replacement) pair.

    The single entry passed here is a 3-tuple, so its length is reported.
    """
    msg = "Argument 0 must have length 2; "
    msg += "a condition and replacement; instead got length 3."

    with pytest.raises(ValueError, match=msg):
        df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))])
def test_case_when_raise_error_from_mask(df):
    """
    Raise ValueError when a replacement cannot be applied.

    The replacement list has two items while the Series has three rows.
    """
    column = df["a"]
    bad_caselist = [(column.eq(1), [1, 2])]
    expected_message = "Failed to apply condition0 and replacement0."
    with pytest.raises(ValueError, match=expected_message):
        column.case_when(bad_caselist)
def test_case_when_single_condition(df):
    """
    Check the output when the caselist holds one condition.
    """
    all_missing = Series([np.nan, np.nan, np.nan])
    caselist = [(df.a.eq(1), 1)]

    result = all_missing.case_when(caselist)

    expected = Series([1, np.nan, np.nan])
    tm.assert_series_equal(result, expected)
def test_case_when_multiple_conditions(df):
    """
    Check the output for two conditions: one computed, one a literal mask.
    """
    all_missing = Series([np.nan, np.nan, np.nan])
    first_case = (df.a.eq(1), 1)
    second_case = (Series([False, True, False]), 2)

    result = all_missing.case_when([first_case, second_case])

    expected = Series([1, 2, np.nan])
    tm.assert_series_equal(result, expected)
def test_case_when_multiple_conditions_replacement_list(df):
    """
    Check the output when one replacement is a plain list.
    """
    all_missing = Series([np.nan, np.nan, np.nan])
    computed_mask = df["a"].gt(1) & df["b"].eq(5)
    caselist = [([True, False, False], 1), (computed_mask, [1, 2, 3])]

    result = all_missing.case_when(caselist)

    expected = Series([1, 2, np.nan])
    tm.assert_series_equal(result, expected)
def test_case_when_multiple_conditions_replacement_extension_dtype(df):
    """
    Check the output when one replacement is an Int64 extension array.

    The expected result is a Float64 Series.
    """
    all_missing = Series([np.nan, np.nan, np.nan])
    computed_mask = df["a"].gt(1) & df["b"].eq(5)
    nullable_ints = pd_array([1, 2, 3], dtype="Int64")
    caselist = [
        ([True, False, False], 1),
        (computed_mask, nullable_ints),
    ]

    result = all_missing.case_when(caselist)

    expected = Series([1, 2, np.nan], dtype="Float64")
    tm.assert_series_equal(result, expected)
def test_case_when_multiple_conditions_replacement_series(df):
    """
    Check the output when one replacement is itself a Series.
    """
    all_missing = Series([np.nan, np.nan, np.nan])
    array_mask = np.array([True, False, False])
    computed_mask = df["a"].gt(1) & df["b"].eq(5)
    caselist = [
        (array_mask, 1),
        (computed_mask, Series([1, 2, 3])),
    ]

    result = all_missing.case_when(caselist)

    expected = Series([1, 2, np.nan])
    tm.assert_series_equal(result, expected)
def test_case_when_non_range_index():
"""
Test output if index is not RangeIndex
"""
rng = np.random.default_rng(seed=123)
dates = date_range("1/1/2000", periods=8)
df = DataFrame(
rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"]
)
result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)])
expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5)
tm.assert_series_equal(result, expected)
def test_case_when_callable():
"""
Test output on a callable
"""
# https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html
x = np.linspace(-2.5, 2.5, 6)
ser = Series(x)
result = ser.case_when(
caselist=[
(lambda df: df < 0, lambda df: -df),
(lambda df: df >= 0, lambda df: df),
]
)
expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x])
tm.assert_series_equal(result, Series(expected))

View File

@ -0,0 +1,146 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import (
Series,
Timestamp,
isna,
notna,
)
import pandas._testing as tm
class TestSeriesClip:
    """Checks for ``Series.clip``."""

    def test_clip(self, datetime_series):
        """Clip at the median and at fixed bounds; compare with ``np.clip``."""
        median = datetime_series.median()

        clipped_below = datetime_series.clip(lower=median)
        assert clipped_below.min() == median
        clipped_above = datetime_series.clip(upper=median)
        assert clipped_above.max() == median

        result = datetime_series.clip(-0.5, 0.5)
        expected = np.clip(datetime_series, -0.5, 0.5)
        tm.assert_series_equal(result, expected)
        assert isinstance(expected, Series)

    def test_clip_types_and_nulls(self):
        """Clipping float, string and datetime Series leaves missing values in place."""
        candidates = [
            Series([np.nan, 1.0, 2.0, 3.0]),
            Series([None, "a", "b", "c"]),
            Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")),
        ]

        for ser in candidates:
            thresh = ser[2]
            clipped_low = ser.clip(lower=thresh)
            clipped_high = ser.clip(upper=thresh)
            assert clipped_low[notna(clipped_low)].min() == thresh
            assert clipped_high[notna(clipped_high)].max() == thresh
            null_positions = list(isna(ser))
            assert null_positions == list(isna(clipped_low))
            assert null_positions == list(isna(clipped_high))

    def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixture):
        """Clipping a nullable numeric Series that contains a missing value."""
        # Ensure that clipping method can handle NA values without failing
        # GH#40581

        if nulls_fixture is pd.NaT:
            # constructor will raise, see
            #  test_constructor_mismatched_null_nullable_dtype
            pytest.skip("See test_constructor_mismatched_null_nullable_dtype")

        def build(values):
            return Series(values, dtype=any_numeric_ea_dtype)

        ser = build([nulls_fixture, 1.0, 3.0])
        s_clipped_upper = ser.clip(upper=2.0)
        s_clipped_lower = ser.clip(lower=2.0)

        expected_upper = build([nulls_fixture, 1.0, 2.0])
        expected_lower = build([nulls_fixture, 2.0, 3.0])

        tm.assert_series_equal(s_clipped_upper, expected_upper)
        tm.assert_series_equal(s_clipped_lower, expected_lower)

    def test_clip_with_na_args(self):
        """Should process np.nan argument as None"""
        # GH#17276
        ser = Series([1, 2, 3])
        unchanged = Series([1, 2, 3])

        tm.assert_series_equal(ser.clip(np.nan), unchanged)
        tm.assert_series_equal(ser.clip(upper=np.nan, lower=np.nan), unchanged)

        # GH#19992
        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
        # TODO: avoid this warning here?  seems like we should never be upcasting
        #  in the first place?
        with tm.assert_produces_warning(FutureWarning, match=msg):
            lower_clipped = ser.clip(lower=[0, 4, np.nan])
        tm.assert_series_equal(lower_clipped, Series([1, 4, 3]))
        with tm.assert_produces_warning(FutureWarning, match=msg):
            upper_clipped = ser.clip(upper=[1, np.nan, 1])
        tm.assert_series_equal(upper_clipped, Series([1, 2, 1]))

        # GH#40420
        ser = Series([1, 2, 3])
        result = ser.clip(0, [np.nan, np.nan, np.nan])
        tm.assert_series_equal(ser, result)

    def test_clip_against_series(self):
        """Clip with Series-valued bounds, and with a scalar/Series mix."""
        # GH#6966

        ser = Series([1.0, 1.0, 4.0])

        lower = Series([1.0, 2.0, 3.0])
        upper = Series([1.5, 2.5, 3.5])

        both_series = ser.clip(lower, upper)
        tm.assert_series_equal(both_series, Series([1.0, 2.0, 3.5]))
        scalar_lower = ser.clip(1.5, upper)
        tm.assert_series_equal(scalar_lower, Series([1.5, 1.5, 3.5]))

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])])
    def test_clip_against_list_like(self, inplace, upper):
        """Clip against a list or ndarray upper bound, in place or not."""
        # GH#15390
        original = Series([5, 6, 7])
        returned = original.clip(upper=upper, inplace=inplace)
        expected = Series([1, 2, 3])

        # With inplace=True the clipped values live on ``original`` itself.
        result = original if inplace else returned
        tm.assert_series_equal(result, expected, check_exact=True)

    def test_clip_with_datetimes(self):
        """Clip naive and tz-aware datetime Series with a Timestamp upper bound."""
        # GH#11838
        # naive and tz-aware datetimes
        for tz in (None, "US/Eastern"):
            bound = Timestamp("2015-12-01 09:30:30", tz=tz)
            ser = Series(
                [
                    Timestamp("2015-12-01 09:30:00", tz=tz),
                    Timestamp("2015-12-01 09:31:00", tz=tz),
                ]
            )
            result = ser.clip(upper=bound)
            expected = Series(
                [
                    Timestamp("2015-12-01 09:30:00", tz=tz),
                    Timestamp("2015-12-01 09:30:30", tz=tz),
                ]
            )
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [object, "M8[us]"])
    def test_clip_with_timestamps_and_oob_datetimes(self, dtype):
        """Clip datetimes outside the Timestamp range to Timestamp.min/max."""
        # GH-42794
        out_of_bounds = [datetime(1, 1, 1), datetime(9999, 9, 9)]
        ser = Series(out_of_bounds, dtype=dtype)

        result = ser.clip(lower=Timestamp.min, upper=Timestamp.max)
        expected = Series([Timestamp.min, Timestamp.max], dtype=dtype)

        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,17 @@
from pandas import Series
import pandas._testing as tm
class TestCombine:
    """Checks for ``Series.combine`` with a scalar second operand."""

    def test_combine_scalar(self):
        """Combine a Series with a scalar through addition and through ``min``."""
        # GH#21248
        # Note - combine() with another Series is tested elsewhere because
        # it is used when testing operators
        multiples_of_ten = [i * 10 for i in range(5)]
        ser = Series(multiples_of_ten)

        summed = ser.combine(3, lambda x, y: x + y)
        tm.assert_series_equal(summed, Series([i * 10 + 3 for i in range(5)]))

        capped = ser.combine(22, lambda x, y: min(x, y))
        tm.assert_series_equal(capped, Series([min(i * 10, 22) for i in range(5)]))

View File

@ -0,0 +1,149 @@
from datetime import datetime
import numpy as np
import pandas as pd
from pandas import (
Period,
Series,
date_range,
period_range,
to_datetime,
)
import pandas._testing as tm
class TestCombineFirst:
def test_combine_first_period_datetime(self):
# GH#3367
didx = date_range(start="1950-01-31", end="1950-07-31", freq="ME")
pidx = period_range(start=Period("1950-1"), end=Period("1950-7"), freq="M")
# check to be consistent with DatetimeIndex
for idx in [didx, pidx]:
a = Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx)
b = Series([9, 9, 9, 9, 9, 9, 9], index=idx)
result = a.combine_first(b)
expected = Series([1, 9, 9, 4, 5, 9, 7], index=idx, dtype=np.float64)
tm.assert_series_equal(result, expected)
def test_combine_first_name(self, datetime_series):
result = datetime_series.combine_first(datetime_series[:5])
assert result.name == datetime_series.name
def test_combine_first(self):
values = np.arange(20, dtype=np.float64)
series = Series(values, index=np.arange(20, dtype=np.int64))
series_copy = series * 2
series_copy[::2] = np.nan
# nothing used from the input
combined = series.combine_first(series_copy)
tm.assert_series_equal(combined, series)
# Holes filled from input
combined = series_copy.combine_first(series)
assert np.isfinite(combined).all()
tm.assert_series_equal(combined[::2], series[::2])
tm.assert_series_equal(combined[1::2], series_copy[1::2])
# mixed types
index = pd.Index([str(i) for i in range(20)])
floats = Series(np.random.default_rng(2).standard_normal(20), index=index)
strings = Series([str(i) for i in range(10)], index=index[::2], dtype=object)
combined = strings.combine_first(floats)
tm.assert_series_equal(strings, combined.loc[index[::2]])
tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]])
# corner case
ser = Series([1.0, 2, 3], index=[0, 1, 2])
empty = Series([], index=[], dtype=object)
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.combine_first(empty)
ser.index = ser.index.astype("O")
tm.assert_series_equal(ser, result)
def test_combine_first_dt64(self, unit):
s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit)
s1 = to_datetime(Series([np.nan, "2011"])).dt.as_unit(unit)
rs = s0.combine_first(s1)
xp = to_datetime(Series(["2010", "2011"])).dt.as_unit(unit)
tm.assert_series_equal(rs, xp)
s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit)
s1 = Series([np.nan, "2011"])
rs = s0.combine_first(s1)
xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]")
tm.assert_series_equal(rs, xp)
def test_combine_first_dt_tz_values(self, tz_naive_fixture):
ser1 = Series(
pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture),
name="ser1",
)
ser2 = Series(
pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture),
index=[2, 3, 4],
name="ser2",
)
result = ser1.combine_first(ser2)
exp_vals = pd.DatetimeIndex(
["20150101", "20150102", "20150103", "20160515", "20160516"],
tz=tz_naive_fixture,
)
exp = Series(exp_vals, name="ser1")
tm.assert_series_equal(exp, result)
def test_combine_first_timezone_series_with_empty_series(self):
# GH 41800
time_index = date_range(
datetime(2021, 1, 1, 1),
datetime(2021, 1, 1, 10),
freq="h",
tz="Europe/Rome",
)
s1 = Series(range(10), index=time_index)
s2 = Series(index=time_index)
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = s1.combine_first(s2)
tm.assert_series_equal(result, s1)
def test_combine_first_preserves_dtype(self):
# GH51764
s1 = Series([1666880195890293744, 1666880195890293837])
s2 = Series([1, 2, 3])
result = s1.combine_first(s2)
expected = Series([1666880195890293744, 1666880195890293837, 3])
tm.assert_series_equal(result, expected)
def test_combine_mixed_timezone(self):
# GH 26283
uniform_tz = Series({pd.Timestamp("2019-05-01", tz="UTC"): 1.0})
multi_tz = Series(
{
pd.Timestamp("2019-05-01 01:00:00+0100", tz="Europe/London"): 2.0,
pd.Timestamp("2019-05-02", tz="UTC"): 3.0,
}
)
result = uniform_tz.combine_first(multi_tz)
expected = Series(
[1.0, 3.0],
index=pd.Index(
[
pd.Timestamp("2019-05-01 00:00:00+00:00", tz="UTC"),
pd.Timestamp("2019-05-02 00:00:00+00:00", tz="UTC"),
],
dtype="object",
),
)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,141 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """Compare two Series, aligning the differences on rows or on columns."""
    # GH#30429
    left = pd.Series(["a", "b", "c"])
    right = pd.Series(["x", "b", "z"])

    result = left.compare(right, align_axis=align_axis)

    if align_axis not in (1, "columns"):
        # Row alignment stacks self/other under each differing label.
        indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
        expected = pd.Series(["a", "x", "c", "z"], index=indices)
        tm.assert_series_equal(result, expected)
        return

    indices = pd.Index([0, 2])
    columns = pd.Index(["self", "other"])
    expected = pd.DataFrame([["a", "x"], ["c", "z"]], index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Compare two Series under the ``keep_shape``/``keep_equal`` combinations."""
    left = pd.Series(["a", "b", "c"])
    right = pd.Series(["x", "b", "z"])

    result = left.compare(right, keep_shape=keep_shape, keep_equal=keep_equal)

    if keep_shape:
        # The equal middle row is kept: with its values when keep_equal is
        # set, as missing values otherwise.
        middle_row = ["b", "b"] if keep_equal else [np.nan, np.nan]
        rows = [["a", "x"], middle_row, ["c", "z"]]
        indices = pd.Index([0, 1, 2])
    else:
        rows = [["a", "x"], ["c", "z"]]
        indices = pd.Index([0, 2])

    columns = pd.Index(["self", "other"])
    expected = pd.DataFrame(rows, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_with_equal_nulls():
    """Two NaNs at the same position count as equal and are dropped."""
    # We want to make sure two NaNs are considered the same
    # and dropped where applicable
    left = pd.Series(["a", "b", np.nan])
    right = pd.Series(["x", "b", np.nan])

    differences = left.compare(right)

    only_first_row = pd.DataFrame([["a", "x"]], columns=["self", "other"])
    tm.assert_frame_equal(differences, only_first_row)
def test_compare_with_non_equal_nulls():
    """A NaN facing a real value is reported as a difference."""
    # We want to make sure the relevant NaNs do not get dropped
    left = pd.Series(["a", "b", "c"])
    right = pd.Series(["x", "b", np.nan])

    differences = left.compare(right, align_axis=0)

    stacked_labels = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
    expected = pd.Series(["a", "x", "c", np.nan], index=stacked_labels)
    tm.assert_series_equal(differences, expected)
def test_compare_multi_index():
    """Compare two Series that share a two-level MultiIndex."""
    shared_index = pd.MultiIndex.from_arrays([[0, 0, 1], [0, 1, 2]])
    left = pd.Series(["a", "b", "c"], index=shared_index)
    right = pd.Series(["x", "b", "z"], index=shared_index)

    differences = left.compare(right, align_axis=0)

    # A third level distinguishes the "self" value from the "other" value.
    level_one = [0, 0, 1, 1]
    level_two = [0, 0, 2, 2]
    origin = ["self", "other", "self", "other"]
    expected_index = pd.MultiIndex.from_arrays([level_one, level_two, origin])
    expected = pd.Series(["a", "x", "c", "z"], index=expected_index)
    tm.assert_series_equal(differences, expected)
def test_compare_unaligned_objects():
    """Raise ValueError when the two Series are not identically labeled."""
    msg = "Can only compare identically-labeled Series objects"

    # The Series are built outside the ``pytest.raises`` blocks so that only
    # the ``compare`` call itself can satisfy the expected exception.

    # test Series with different indices
    ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
    ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        ser1.compare(ser2)

    # test Series with different lengths
    ser1 = pd.Series([1, 2, 3])
    ser2 = pd.Series([1, 2, 3, 4])
    with pytest.raises(ValueError, match=msg):
        ser1.compare(ser2)
def test_compare_datetime64_and_string():
    """Element-wise (in)equality between a datetime64 and a string column."""
    # Issue https://github.com/pandas-dev/pandas/issues/45506
    # Catch OverflowError when comparing datetime64 and string
    dates = ["2015-07-01", "2015-07-02", "2015-07-03", "2015-07-04", "2015-07-05"]
    phone_numbers = [
        "08335394550",
        "+49 (0) 0345 300033",
        "+49(0)2598 04457",
        "0741470003",
        "04181 83668",
    ]
    dtypes = {"a": "datetime64[ns]", "b": "string"}
    df = pd.DataFrame({"a": dates, "b": phone_numbers}).astype(dtypes)

    result_eq1 = df["a"].eq(df["b"])
    result_eq2 = df["a"] == df["b"]
    result_neq = df["a"] != df["b"]

    row_count = 5
    expected_eq = pd.Series([False] * row_count)  # For .eq and ==
    expected_neq = pd.Series([True] * row_count)  # For !=

    tm.assert_series_equal(result_eq1, expected_eq)
    tm.assert_series_equal(result_eq2, expected_eq)
    tm.assert_series_equal(result_neq, expected_neq)

View File

@ -0,0 +1,306 @@
from itertools import product
import numpy as np
import pytest
from pandas._libs import lib
import pandas as pd
import pandas._testing as tm
# Each entry below is a 4-tuple:
#   (data, original dtype, default expected dtype, overrides)
# ``data`` and ``original dtype`` build the test Series (``None`` means no
# dtype is passed).  The default expected dtype is valid for most keyword
# combinations; ``overrides`` lists the cases that deviate from it, as a dict
# keyed by a flat (keyword, value, keyword, value, ...) tuple.  When several
# overrides apply, the last one takes precedence.
def _sample_datetimes(unit):
    """Return the two sample timestamps as an index with resolution *unit*."""
    return pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit(unit)


_CONVERT_DTYPES_CASES = [
    (
        [1, 2, 3],
        np.dtype("int32"),
        "Int32",
        {("convert_integer", False): np.dtype("int32")},
    ),
    (
        [1, 2, 3],
        np.dtype("int64"),
        "Int64",
        {("convert_integer", False): np.dtype("int64")},
    ),
    (
        ["x", "y", "z"],
        np.dtype("O"),
        pd.StringDtype(),
        {("convert_string", False): np.dtype("O")},
    ),
    (
        [True, False, np.nan],
        np.dtype("O"),
        pd.BooleanDtype(),
        {("convert_boolean", False): np.dtype("O")},
    ),
    (
        ["h", "i", np.nan],
        np.dtype("O"),
        pd.StringDtype(),
        {("convert_string", False): np.dtype("O")},
    ),
    # GH32117
    (["h", "i", 1], np.dtype("O"), np.dtype("O"), {}),
    (
        [10, np.nan, 20],
        np.dtype("float"),
        "Int64",
        {
            ("convert_integer", False, "convert_floating", True): "Float64",
            ("convert_integer", False, "convert_floating", False): np.dtype(
                "float"
            ),
        },
    ),
    (
        [np.nan, 100.5, 200],
        np.dtype("float"),
        "Float64",
        {("convert_floating", False): np.dtype("float")},
    ),
    ([3, 4, 5], "Int8", "Int8", {}),
    ([[1, 2], [3, 4], [5]], None, np.dtype("O"), {}),
    (
        [4, 5, 6],
        np.dtype("uint32"),
        "UInt32",
        {("convert_integer", False): np.dtype("uint32")},
    ),
    (
        [-10, 12, 13],
        np.dtype("i1"),
        "Int8",
        {("convert_integer", False): np.dtype("i1")},
    ),
    (
        [1.2, 1.3],
        np.dtype("float32"),
        "Float32",
        {("convert_floating", False): np.dtype("float32")},
    ),
    (
        [1, 2.0],
        object,
        "Int64",
        {
            ("convert_integer", False): "Float64",
            ("convert_integer", False, "convert_floating", False): np.dtype(
                "float"
            ),
            ("infer_objects", False): np.dtype("object"),
        },
    ),
    (
        [1, 2.5],
        object,
        "Float64",
        {
            ("convert_floating", False): np.dtype("float"),
            ("infer_objects", False): np.dtype("object"),
        },
    ),
    (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
    # One case per resolution, each requesting a UTC dtype.
    *[
        (
            _sample_datetimes(unit),
            pd.DatetimeTZDtype(tz="UTC"),
            pd.DatetimeTZDtype(tz="UTC"),
            {},
        )
        for unit in ("s", "ms", "us", "ns")
    ],
    (
        _sample_datetimes("ns"),
        "datetime64[ns]",
        np.dtype("datetime64[ns]"),
        {},
    ),
    (
        _sample_datetimes("ns"),
        object,
        np.dtype("datetime64[ns]"),
        {("infer_objects", False): np.dtype("object")},
    ),
    (
        pd.period_range("1/1/2011", freq="M", periods=3),
        None,
        pd.PeriodDtype("M"),
        {},
    ),
    (
        pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
        None,
        pd.IntervalDtype("int64", "right"),
        {},
    ),
]


@pytest.fixture(params=_CONVERT_DTYPES_CASES)
def test_cases(request):
    """Yield one (data, dtype, default expected dtype, overrides) tuple per run."""
    return request.param
class TestSeriesConvertDtypes:
    """Checks for ``convert_dtypes`` on Series and DataFrame inputs."""

    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
    def test_convert_dtypes(
        self,
        test_cases,
        params,
        using_infer_string,
    ):
        """
        Run ``convert_dtypes`` with every True/False combination of its five
        positional flags and compare the result dtype against the case's
        default or the applicable override; then check the input is untouched.
        """
        data, maindtype, expected_default, expected_other = test_cases

        builds_tz_dtype_from_datetime_data = (
            hasattr(data, "dtype")
            and lib.is_np_dtype(data.dtype, "M")
            and isinstance(maindtype, pd.DatetimeTZDtype)
        )
        if builds_tz_dtype_from_datetime_data:
            # this astype is deprecated in favor of tz_localize
            msg = "Cannot use .astype to convert from timezone-naive dtype"
            with pytest.raises(TypeError, match=msg):
                pd.Series(data, dtype=maindtype)
            return

        if maindtype is None:
            series = pd.Series(data)
        else:
            series = pd.Series(data, dtype=maindtype)

        result = series.convert_dtypes(*params)

        # Positional order of the flags passed above.
        param_names = [
            "infer_objects",
            "convert_string",
            "convert_integer",
            "convert_boolean",
            "convert_floating",
        ]
        params_dict = dict(zip(param_names, params))

        expected_dtype = expected_default
        for spec, dtype in expected_other.items():
            # ``spec`` alternates keyword names and required values.
            keywords, required_values = spec[::2], spec[1::2]
            if all(
                params_dict[key] is val for key, val in zip(keywords, required_values)
            ):
                expected_dtype = dtype

        if (
            using_infer_string
            and expected_default == "string"
            and expected_dtype == object
            and params[0]
            and not params[1]
        ):
            # If we would convert with convert strings then infer_objects converts
            # with the option
            expected_dtype = "string[pyarrow_numpy]"

        expected = pd.Series(data, dtype=expected_dtype)
        tm.assert_series_equal(result, expected)

        # Test that it is a copy
        snapshot = series.copy(deep=True)

        if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]:
            with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
                result[result.notna()] = np.nan
        else:
            result[result.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, snapshot)

    def test_convert_string_dtype(self, nullable_string_dtype):
        """Columns that already have a string dtype come back unchanged."""
        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
        # that are already string dtype
        columns = {"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}
        df = pd.DataFrame(columns, dtype=nullable_string_dtype)
        result = df.convert_dtypes()
        tm.assert_frame_equal(df, result)

    def test_convert_bool_dtype(self):
        """A frame built from ``pd.array([True])`` comes back unchanged."""
        # GH32287
        df = pd.DataFrame({"A": pd.array([True])})
        converted = df.convert_dtypes()
        tm.assert_frame_equal(df, converted)

    def test_convert_byte_string_dtype(self):
        """A column holding a bytes object comes back unchanged."""
        # GH-43183
        byte_str = b"binary-string"

        df = pd.DataFrame(data={"A": byte_str}, index=[0])
        converted = df.convert_dtypes()
        tm.assert_frame_equal(converted, df)

    @pytest.mark.parametrize(
        "infer_objects, dtype", [(True, "Int64"), (False, "object")]
    )
    def test_convert_dtype_object_with_na(self, infer_objects, dtype):
        """Object Series of an int and NA: Int64 with inference, object without."""
        # GH#48791
        values = [1, pd.NA]
        ser = pd.Series(values)
        result = ser.convert_dtypes(infer_objects=infer_objects)
        expected = pd.Series(values, dtype=dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "infer_objects, dtype", [(True, "Float64"), (False, "object")]
    )
    def test_convert_dtype_object_with_na_float(self, infer_objects, dtype):
        """Object Series of a float and NA: Float64 with inference, object without."""
        # GH#48791
        values = [1.5, pd.NA]
        ser = pd.Series(values)
        result = ser.convert_dtypes(infer_objects=infer_objects)
        expected = pd.Series(values, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        """pyarrow int32 converts to Int32 with the numpy_nullable backend."""
        # GH 53648
        pytest.importorskip("pyarrow")
        ser = pd.Series(range(2), dtype="int32[pyarrow]")
        result = ser.convert_dtypes(dtype_backend="numpy_nullable")
        expected = pd.Series(range(2), dtype="Int32")
        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_pyarrow_null(self):
        """An all-None Series converts to the pyarrow null type."""
        # GH#55346
        pa = pytest.importorskip("pyarrow")
        ser = pd.Series([None, None])
        result = ser.convert_dtypes(dtype_backend="pyarrow")
        null_dtype = pd.ArrowDtype(pa.null())
        expected = pd.Series([None, None], dtype=null_dtype)
        tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,91 @@
import numpy as np
import pytest
from pandas import (
Series,
Timestamp,
)
import pandas._testing as tm
class TestCopy:
@pytest.mark.parametrize("deep", ["default", None, False, True])
def test_copy(self, deep, using_copy_on_write, warn_copy_on_write):
    """
    Copy a float Series with each ``deep`` setting, write into the copy and
    check whether the original was affected.
    """
    ser = Series(np.arange(10), dtype="float64")

    # default deep is True
    ser2 = ser.copy() if deep == "default" else ser.copy(deep=deep)

    if using_copy_on_write:
        # INFO(CoW) a shallow copy doesn't yet copy the data
        # but parent will not be modified (CoW)
        if deep is None or deep is False:
            assert np.may_share_memory(ser.values, ser2.values)
        else:
            assert not np.may_share_memory(ser.values, ser2.values)

    with tm.assert_cow_warning(warn_copy_on_write and deep is False):
        ser2[::2] = np.nan

    # The copy always sees the write.
    assert np.isnan(ser2[0])
    if deep is not False or using_copy_on_write:
        # Did not modify original Series
        assert not np.isnan(ser[0])
    else:
        # we DID modify the original Series
        assert np.isnan(ser[0])
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
@pytest.mark.parametrize("deep", ["default", None, False, True])
def test_copy_tzaware(self, deep, using_copy_on_write):
# GH#11794
# copy of tz-aware
expected = Series([Timestamp("2012/01/01", tz="UTC")])
expected2 = Series([Timestamp("1999/01/01", tz="UTC")])
ser = Series([Timestamp("2012/01/01", tz="UTC")])
if deep == "default":
ser2 = ser.copy()
else:
ser2 = ser.copy(deep=deep)
if using_copy_on_write:
# INFO(CoW) a shallow copy doesn't yet copy the data
# but parent will not be modified (CoW)
if deep is None or deep is False:
assert np.may_share_memory(ser.values, ser2.values)
else:
assert not np.may_share_memory(ser.values, ser2.values)
ser2[0] = Timestamp("1999/01/01", tz="UTC")
# default deep is True
if deep is not False or using_copy_on_write:
# Did not modify original Series
tm.assert_series_equal(ser2, expected2)
tm.assert_series_equal(ser, expected)
else:
# we DID modify the original Series
tm.assert_series_equal(ser2, expected2)
tm.assert_series_equal(ser, expected2)
def test_copy_name(self, datetime_series):
result = datetime_series.copy()
assert result.name == datetime_series.name
def test_copy_index_name_checking(self, datetime_series):
# don't want to be able to modify the index stored elsewhere after
# making a copy
datetime_series.index.name = None
assert datetime_series.index.name is None
assert datetime_series is datetime_series
cp = datetime_series.copy()
cp.index.name = "foo"
assert datetime_series.index.name is None

View File

@ -0,0 +1,34 @@
import numpy as np
import pandas as pd
from pandas import (
Categorical,
Series,
)
import pandas._testing as tm
class TestSeriesCount:
def test_count(self, datetime_series):
assert datetime_series.count() == len(datetime_series)
datetime_series[::2] = np.nan
assert datetime_series.count() == np.isfinite(datetime_series).sum()
def test_count_inf_as_na(self):
# GH#29478
ser = Series([pd.Timestamp("1990/1/1")])
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("use_inf_as_na", True):
assert ser.count() == 1
def test_count_categorical(self):
ser = Series(
Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
)
result = ser.count()
assert result == 2

View File

@ -0,0 +1,185 @@
import math
import numpy as np
import pytest
import pandas as pd
from pandas import (
Series,
date_range,
isna,
)
import pandas._testing as tm
class TestSeriesCov:
def test_cov(self, datetime_series):
# full overlap
tm.assert_almost_equal(
datetime_series.cov(datetime_series), datetime_series.std() ** 2
)
# partial overlap
tm.assert_almost_equal(
datetime_series[:15].cov(datetime_series[5:]),
datetime_series[5:15].std() ** 2,
)
# No overlap
assert np.isnan(datetime_series[::2].cov(datetime_series[1::2]))
# all NA
cp = datetime_series[:10].copy()
cp[:] = np.nan
assert isna(cp.cov(cp))
# min_periods
assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12))
ts1 = datetime_series[:15].reindex(datetime_series.index)
ts2 = datetime_series[5:].reindex(datetime_series.index)
assert isna(ts1.cov(ts2, min_periods=12))
@pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
@pytest.mark.parametrize("dtype", ["float64", "Float64"])
def test_cov_ddof(self, test_ddof, dtype):
# GH#34611
np_array1 = np.random.default_rng(2).random(10)
np_array2 = np.random.default_rng(2).random(10)
s1 = Series(np_array1, dtype=dtype)
s2 = Series(np_array2, dtype=dtype)
result = s1.cov(s2, ddof=test_ddof)
expected = np.cov(np_array1, np_array2, ddof=test_ddof)[0][1]
assert math.isclose(expected, result)
class TestSeriesCorr:
@pytest.mark.parametrize("dtype", ["float64", "Float64"])
def test_corr(self, datetime_series, dtype):
stats = pytest.importorskip("scipy.stats")
datetime_series = datetime_series.astype(dtype)
# full overlap
tm.assert_almost_equal(datetime_series.corr(datetime_series), 1)
# partial overlap
tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1)
assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12))
ts1 = datetime_series[:15].reindex(datetime_series.index)
ts2 = datetime_series[5:].reindex(datetime_series.index)
assert isna(ts1.corr(ts2, min_periods=12))
# No overlap
assert np.isnan(datetime_series[::2].corr(datetime_series[1::2]))
# all NA
cp = datetime_series[:10].copy()
cp[:] = np.nan
assert isna(cp.corr(cp))
A = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
B = A.copy()
result = A.corr(B)
expected, _ = stats.pearsonr(A, B)
tm.assert_almost_equal(result, expected)
def test_corr_rank(self):
stats = pytest.importorskip("scipy.stats")
# kendall and spearman
A = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
B = A.copy()
A[-5:] = A[:5].copy()
result = A.corr(B, method="kendall")
expected = stats.kendalltau(A, B)[0]
tm.assert_almost_equal(result, expected)
result = A.corr(B, method="spearman")
expected = stats.spearmanr(A, B)[0]
tm.assert_almost_equal(result, expected)
# results from R
A = Series(
[
-0.89926396,
0.94209606,
-1.03289164,
-0.95445587,
0.76910310,
-0.06430576,
-2.09704447,
0.40660407,
-0.89926396,
0.94209606,
]
)
B = Series(
[
-1.01270225,
-0.62210117,
-1.56895827,
0.59592943,
-0.01680292,
1.17258718,
-1.06009347,
-0.10222060,
-0.89076239,
0.89372375,
]
)
kexp = 0.4319297
sexp = 0.5853767
tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)
def test_corr_invalid_method(self):
# GH PR #22298
s1 = Series(np.random.default_rng(2).standard_normal(10))
s2 = Series(np.random.default_rng(2).standard_normal(10))
msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
with pytest.raises(ValueError, match=msg):
s1.corr(s2, method="____")
def test_corr_callable_method(self, datetime_series):
# simple correlation example
# returns 1 if exact equality, 0 otherwise
my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0
# simple example
s1 = Series([1, 2, 3, 4, 5])
s2 = Series([5, 4, 3, 2, 1])
expected = 0
tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected)
# full overlap
tm.assert_almost_equal(
datetime_series.corr(datetime_series, method=my_corr), 1.0
)
# partial overlap
tm.assert_almost_equal(
datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0
)
# No overlap
assert np.isnan(
datetime_series[::2].corr(datetime_series[1::2], method=my_corr)
)
# dataframe example
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

View File

@ -0,0 +1,203 @@
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p25
from pandas.core.dtypes.common import (
is_complex_dtype,
is_extension_array_dtype,
)
from pandas import (
NA,
Period,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestSeriesDescribe:
def test_describe_ints(self):
ser = Series([0, 1, 2, 3, 4], name="int_data")
result = ser.describe()
expected = Series(
[5, 2, ser.std(), 0, 1, 2, 3, 4],
name="int_data",
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
def test_describe_bools(self):
ser = Series([True, True, False, False, False], name="bool_data")
result = ser.describe()
expected = Series(
[5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]
)
tm.assert_series_equal(result, expected)
def test_describe_strs(self):
ser = Series(["a", "a", "b", "c", "d"], name="str_data")
result = ser.describe()
expected = Series(
[5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]
)
tm.assert_series_equal(result, expected)
def test_describe_timedelta64(self):
ser = Series(
[
Timedelta("1 days"),
Timedelta("2 days"),
Timedelta("3 days"),
Timedelta("4 days"),
Timedelta("5 days"),
],
name="timedelta_data",
)
result = ser.describe()
expected = Series(
[5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]],
name="timedelta_data",
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
def test_describe_period(self):
ser = Series(
[Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")],
name="period_data",
)
result = ser.describe()
expected = Series(
[3, 2, ser[0], 2],
name="period_data",
index=["count", "unique", "top", "freq"],
)
tm.assert_series_equal(result, expected)
def test_describe_empty_object(self):
# https://github.com/pandas-dev/pandas/issues/27183
s = Series([None, None], dtype=object)
result = s.describe()
expected = Series(
[0, 0, np.nan, np.nan],
dtype=object,
index=["count", "unique", "top", "freq"],
)
tm.assert_series_equal(result, expected)
result = s[:0].describe()
tm.assert_series_equal(result, expected)
# ensure NaN, not None
assert np.isnan(result.iloc[2])
assert np.isnan(result.iloc[3])
def test_describe_with_tz(self, tz_naive_fixture):
# GH 21332
tz = tz_naive_fixture
name = str(tz_naive_fixture)
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
expected = Series(
[
5,
Timestamp(2018, 1, 3).tz_localize(tz),
start.tz_localize(tz),
s[1],
s[2],
s[3],
end.tz_localize(tz),
],
name=name,
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
def test_describe_with_tz_numeric(self):
name = tz = "CET"
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
expected = Series(
[
5,
Timestamp("2018-01-03 00:00:00", tz=tz),
Timestamp("2018-01-01 00:00:00", tz=tz),
Timestamp("2018-01-02 00:00:00", tz=tz),
Timestamp("2018-01-03 00:00:00", tz=tz),
Timestamp("2018-01-04 00:00:00", tz=tz),
Timestamp("2018-01-05 00:00:00", tz=tz),
],
name=name,
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
def test_datetime_is_numeric_includes_datetime(self):
s = Series(date_range("2012", periods=3))
result = s.describe()
expected = Series(
[
3,
Timestamp("2012-01-02"),
Timestamp("2012-01-01"),
Timestamp("2012-01-01T12:00:00"),
Timestamp("2012-01-02"),
Timestamp("2012-01-02T12:00:00"),
Timestamp("2012-01-03"),
],
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
@pytest.mark.filterwarnings("ignore:Casting complex values to real discards")
def test_numeric_result_dtype(self, any_numeric_dtype):
# GH#48340 - describe should always return float on non-complex numeric input
if is_extension_array_dtype(any_numeric_dtype):
dtype = "Float64"
else:
dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None
ser = Series([0, 1], dtype=any_numeric_dtype)
if dtype == "complex128" and np_version_gte1p25:
with pytest.raises(
TypeError, match=r"^a must be an array of real numbers$"
):
ser.describe()
return
result = ser.describe()
expected = Series(
[
2.0,
0.5,
ser.std(),
0,
0.25,
0.5,
0.75,
1.0,
],
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
dtype=dtype,
)
tm.assert_series_equal(result, expected)
def test_describe_one_element_ea(self):
# GH#52515
ser = Series([0.0], dtype="Float64")
with tm.assert_produces_warning(None):
result = ser.describe()
expected = Series(
[1, 0, NA, 0, 0, 0, 0, 0],
dtype="Float64",
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,88 @@
import numpy as np
import pytest
from pandas import (
Series,
TimedeltaIndex,
date_range,
)
import pandas._testing as tm
class TestSeriesDiff:
    """Tests for ``Series.diff`` over numeric, datetime-like, bool and object data."""

    def test_diff_np(self):
        """``np.diff`` on a Series returns a plain ndarray of differences."""
        # TODO(__array_function__): could make np.diff return a Series
        #  matching ser.diff()
        res = np.diff(Series(np.arange(5)))
        tm.assert_numpy_array_equal(res, np.array([1, 1, 1, 1]))

    def test_diff_int(self):
        """Large integers differing by one give a diff of exactly 1."""
        # int dtype
        base = 10000000000000000
        result = Series([base, base + 1]).diff()
        assert result[1] == 1

    def test_diff_tz(self):
        """Negative and zero periods agree with the shift-and-subtract form."""
        # Combined datetime diff, normal diff and boolean diff test
        ts = Series(
            np.arange(10, dtype=np.float64),
            index=date_range("2020-01-01", periods=10),
            name="ts",
        )
        # default periods: only exercised, result is not inspected
        ts.diff()

        # neg n
        tm.assert_series_equal(ts.diff(-1), ts - ts.shift(-1))

        # 0
        tm.assert_series_equal(ts.diff(0), ts - ts)

    def test_diff_dt64(self):
        """Datetime diff, then timedelta diff, match the shift-and-subtract form."""
        # datetime diff (GH#3100)
        ser = Series(date_range("20130102", periods=5))
        result = ser.diff()
        expected = ser - ser.shift(1)
        tm.assert_series_equal(result, expected)

        # timedelta diff
        second_order = result - result.shift(1)  # previous result
        expected_second_order = expected.diff()  # previously expected
        tm.assert_series_equal(second_order, expected_second_order)

    def test_diff_dt64tz(self):
        """Diff of tz-aware daily datetimes is NaT followed by one-day timedeltas."""
        # with tz
        ser = Series(
            date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo"
        )
        expected = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo")
        tm.assert_series_equal(ser.diff(), expected)

    @pytest.mark.parametrize(
        "input,output,diff",
        [([False, True, True, False, False], [np.nan, True, False, True, False], 1)],
    )
    def test_diff_bool(self, input, output, diff):
        """Diff of a boolean Series matches the parametrized expected output."""
        # boolean series (test for fixing #17294)
        # NOTE: ``diff`` is parametrized but not passed on; ``Series.diff()``
        # is called with its default periods.
        result = Series(input).diff()
        tm.assert_series_equal(result, Series(output))

    def test_diff_object_dtype(self):
        """Diff of a mixed-object Series matches the shift-and-subtract form."""
        # object series
        ser = Series([False, True, 5.0, np.nan, True, False])
        tm.assert_series_equal(ser.diff(), ser - ser.shift(1))

View File

@ -0,0 +1,99 @@
import pytest
from pandas import (
Index,
Series,
)
import pandas._testing as tm
from pandas.api.types import is_bool_dtype
@pytest.mark.parametrize(
    "data, index, drop_labels, axis, expected_data, expected_index",
    [
        # Unique Index
        ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]),
        ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]),
        ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]),
        # GH 5248 Non-Unique Index
        ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]),
        ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]),
        ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]),
    ],
)
def test_drop_unique_and_non_unique_index(
    data, index, axis, drop_labels, expected_data, expected_index
):
    """Dropping labels removes every matching row, for unique and repeated labels."""
    dropped = Series(data=data, index=index).drop(drop_labels, axis=axis)
    tm.assert_series_equal(dropped, Series(data=expected_data, index=expected_index))
@pytest.mark.parametrize(
    "data, index, drop_labels, axis, error_type, error_desc",
    [
        # single string label that is not in the index
        (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"),
        # tuple-like label that is not in the index
        (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"),
        # bad axis
        (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"),
    ],
)
def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc):
    """``drop`` raises the parametrized error for missing labels or an invalid axis."""
    target = Series(data, index=index)
    with pytest.raises(error_type, match=error_desc):
        target.drop(drop_labels, axis=axis)
def test_drop_with_ignore_errors():
    """``errors="ignore"`` skips missing labels; also checks dropping from a bool index."""
    # errors='ignore'
    letters = Series(range(3), index=list("abc"))
    tm.assert_series_equal(letters.drop("bc", errors="ignore"), letters)

    partial = letters.drop(["a", "d"], errors="ignore")
    tm.assert_series_equal(partial, letters.iloc[1:])

    # GH 8522
    flags = Series([2, 3], index=[True, False])
    assert is_bool_dtype(flags.index)
    assert flags.index.dtype == bool
    tm.assert_series_equal(flags.drop(True), Series([3], index=[False]))
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]])
@pytest.mark.parametrize("drop_labels", [[], [1], [3]])
def test_drop_empty_list(index, drop_labels):
    """Dropping an empty or present label list from a value-less Series keeps the rest."""
    # GH 21494
    remaining = [label for label in index if label not in drop_labels]
    series = Series(index=index, dtype=object).drop(drop_labels)
    tm.assert_series_equal(series, Series(index=remaining, dtype=object))
@pytest.mark.parametrize(
    "data, index, drop_labels",
    [
        (None, [1, 2, 3], [1, 4]),
        (None, [1, 2, 2], [1, 4]),
        ([2, 3], [0, 1], [False, True]),
    ],
)
def test_drop_non_empty_list(data, index, drop_labels):
    """Dropping a list that contains a missing label raises ``KeyError``."""
    # GH 21494 and GH 16877
    # A value-less Series is built with object dtype; otherwise dtype is inferred.
    if data is None:
        dtype = object
    else:
        dtype = None
    target = Series(data=data, index=index, dtype=dtype)
    with pytest.raises(KeyError, match="not found in axis"):
        target.drop(drop_labels)
def test_drop_index_ea_dtype(any_numeric_ea_dtype):
# GH#45860
df = Series(100, index=Index([1, 2, 2], dtype=any_numeric_ea_dtype))
idx = Index([df.index[1]])
result = df.drop(idx)
expected = Series(100, index=Index([1], dtype=any_numeric_ea_dtype))
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,267 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"keep, expected",
[
("first", Series([False, False, False, False, True, True, False])),
("last", Series([False, True, True, False, False, False, False])),
(False, Series([False, True, True, False, True, True, False])),
],
)
def test_drop_duplicates(any_numpy_dtype, keep, expected):
tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))
if tc.dtype == "bool":
pytest.skip("tested separately in test_drop_duplicates_bool")
tm.assert_series_equal(tc.duplicated(keep=keep), expected)
tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
sc = tc.copy()
return_value = sc.drop_duplicates(keep=keep, inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, True])),
        ("last", Series([True, True, False, False])),
        (False, Series([True, True, True, True])),
    ],
)
def test_drop_duplicates_bool(keep, expected):
    """Boolean counterpart of ``test_drop_duplicates``."""
    tc = Series([True, False, True, False])

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])

    # the inplace variant mutates the copy and returns None
    sc = tc.copy()
    return_value = sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, tc[~expected])
    assert return_value is None
@pytest.mark.parametrize("values", [[], list(range(5))])
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
tc = Series(values, dtype=np.dtype(any_numpy_dtype))
expected = Series([False] * len(tc), dtype="bool")
if tc.dtype == "bool":
# 0 -> False and 1-> True
# any other value would be duplicated
tc = tc[:2]
expected = expected[:2]
tm.assert_series_equal(tc.duplicated(keep=keep), expected)
result_dropped = tc.drop_duplicates(keep=keep)
tm.assert_series_equal(result_dropped, tc)
# validate shallow copy
assert result_dropped is not tc
class TestSeriesDropDuplicates:
@pytest.fixture(
params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"]
)
def dtype(self, request):
return request.param
@pytest.fixture
def cat_series_unused_category(self, dtype, ordered):
# Test case 1
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
cat = Categorical(input1, categories=cat_array, ordered=ordered)
tc1 = Series(cat)
return tc1
def test_drop_duplicates_categorical_non_bool(self, cat_series_unused_category):
tc1 = cat_series_unused_category
expected = Series([False, False, False, True])
result = tc1.duplicated()
tm.assert_series_equal(result, expected)
result = tc1.drop_duplicates()
tm.assert_series_equal(result, tc1[~expected])
sc = tc1.copy()
return_value = sc.drop_duplicates(inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
def test_drop_duplicates_categorical_non_bool_keeplast(
self, cat_series_unused_category
):
tc1 = cat_series_unused_category
expected = Series([False, False, True, False])
result = tc1.duplicated(keep="last")
tm.assert_series_equal(result, expected)
result = tc1.drop_duplicates(keep="last")
tm.assert_series_equal(result, tc1[~expected])
sc = tc1.copy()
return_value = sc.drop_duplicates(keep="last", inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
def test_drop_duplicates_categorical_non_bool_keepfalse(
self, cat_series_unused_category
):
tc1 = cat_series_unused_category
expected = Series([False, False, True, True])
result = tc1.duplicated(keep=False)
tm.assert_series_equal(result, expected)
result = tc1.drop_duplicates(keep=False)
tm.assert_series_equal(result, tc1[~expected])
sc = tc1.copy()
return_value = sc.drop_duplicates(keep=False, inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
@pytest.fixture
def cat_series(self, dtype, ordered):
# no unused categories, unlike cat_series_unused_category
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
cat = Categorical(input2, categories=cat_array, ordered=ordered)
tc2 = Series(cat)
return tc2
def test_drop_duplicates_categorical_non_bool2(self, cat_series):
tc2 = cat_series
expected = Series([False, False, False, False, True, True, False])
result = tc2.duplicated()
tm.assert_series_equal(result, expected)
result = tc2.drop_duplicates()
tm.assert_series_equal(result, tc2[~expected])
sc = tc2.copy()
return_value = sc.drop_duplicates(inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series):
tc2 = cat_series
expected = Series([False, True, True, False, False, False, False])
result = tc2.duplicated(keep="last")
tm.assert_series_equal(result, expected)
result = tc2.drop_duplicates(keep="last")
tm.assert_series_equal(result, tc2[~expected])
sc = tc2.copy()
return_value = sc.drop_duplicates(keep="last", inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series):
tc2 = cat_series
expected = Series([False, True, True, False, True, True, False])
result = tc2.duplicated(keep=False)
tm.assert_series_equal(result, expected)
result = tc2.drop_duplicates(keep=False)
tm.assert_series_equal(result, tc2[~expected])
sc = tc2.copy()
return_value = sc.drop_duplicates(keep=False, inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
def test_drop_duplicates_categorical_bool(self, ordered):
tc = Series(
Categorical(
[True, False, True, False], categories=[True, False], ordered=ordered
)
)
expected = Series([False, False, True, True])
tm.assert_series_equal(tc.duplicated(), expected)
tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
sc = tc.copy()
return_value = sc.drop_duplicates(inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
expected = Series([True, True, False, False])
tm.assert_series_equal(tc.duplicated(keep="last"), expected)
tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
sc = tc.copy()
return_value = sc.drop_duplicates(keep="last", inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
expected = Series([True, True, True, True])
tm.assert_series_equal(tc.duplicated(keep=False), expected)
tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
sc = tc.copy()
return_value = sc.drop_duplicates(keep=False, inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
def test_drop_duplicates_categorical_bool_na(self, nulls_fixture):
# GH#44351
ser = Series(
Categorical(
[True, False, True, False, nulls_fixture],
categories=[True, False],
ordered=True,
)
)
result = ser.drop_duplicates()
expected = Series(
Categorical([True, False, np.nan], categories=[True, False], ordered=True),
index=[0, 1, 4],
)
tm.assert_series_equal(result, expected)
def test_drop_duplicates_ignore_index(self):
# GH#48304
ser = Series([1, 2, 2, 3])
result = ser.drop_duplicates(ignore_index=True)
expected = Series([1, 2, 3])
tm.assert_series_equal(result, expected)
def test_duplicated_arrow_dtype(self):
pytest.importorskip("pyarrow")
ser = Series([True, False, None, False], dtype="bool[pyarrow]")
result = ser.drop_duplicates()
expected = Series([True, False, None], dtype="bool[pyarrow]")
tm.assert_series_equal(result, expected)
def test_drop_duplicates_arrow_strings(self):
# GH#54904
pa = pytest.importorskip("pyarrow")
ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string()))
result = ser.drop_duplicates()
expecetd = Series(["a"], dtype=pd.ArrowDtype(pa.string()))
tm.assert_series_equal(result, expecetd)

View File

@ -0,0 +1,117 @@
import numpy as np
import pytest
from pandas import (
DatetimeIndex,
IntervalIndex,
NaT,
Period,
Series,
Timestamp,
)
import pandas._testing as tm
class TestDropna:
def test_dropna_empty(self):
ser = Series([], dtype=object)
assert len(ser.dropna()) == 0
return_value = ser.dropna(inplace=True)
assert return_value is None
assert len(ser) == 0
# invalid axis
msg = "No axis named 1 for object type Series"
with pytest.raises(ValueError, match=msg):
ser.dropna(axis=1)
def test_dropna_preserve_name(self, datetime_series):
datetime_series[:5] = np.nan
result = datetime_series.dropna()
assert result.name == datetime_series.name
name = datetime_series.name
ts = datetime_series.copy()
return_value = ts.dropna(inplace=True)
assert return_value is None
assert ts.name == name
def test_dropna_no_nan(self):
for ser in [
Series([1, 2, 3], name="x"),
Series([False, True, False], name="x"),
]:
result = ser.dropna()
tm.assert_series_equal(result, ser)
assert result is not ser
s2 = ser.copy()
return_value = s2.dropna(inplace=True)
assert return_value is None
tm.assert_series_equal(s2, ser)
def test_dropna_intervals(self):
ser = Series(
[np.nan, 1, 2, 3],
IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]),
)
result = ser.dropna()
expected = ser.iloc[1:]
tm.assert_series_equal(result, expected)
def test_dropna_period_dtype(self):
# GH#13737
ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")])
result = ser.dropna()
expected = Series([Period("2011-01", freq="M")])
tm.assert_series_equal(result, expected)
def test_datetime64_tz_dropna(self, unit):
# DatetimeLikeBlock
ser = Series(
[
Timestamp("2011-01-01 10:00"),
NaT,
Timestamp("2011-01-03 10:00"),
NaT,
],
dtype=f"M8[{unit}]",
)
result = ser.dropna()
expected = Series(
[Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")],
index=[0, 2],
dtype=f"M8[{unit}]",
)
tm.assert_series_equal(result, expected)
# DatetimeTZBlock
idx = DatetimeIndex(
["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz="Asia/Tokyo"
).as_unit(unit)
ser = Series(idx)
assert ser.dtype == f"datetime64[{unit}, Asia/Tokyo]"
result = ser.dropna()
expected = Series(
[
Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"),
Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"),
],
index=[0, 2],
dtype=f"datetime64[{unit}, Asia/Tokyo]",
)
assert result.dtype == f"datetime64[{unit}, Asia/Tokyo]"
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("val", [1, 1.5])
def test_dropna_ignore_index(self, val):
# GH#31725
ser = Series([1, 2, val], index=[3, 2, 1])
result = ser.dropna(ignore_index=True)
expected = Series([1, 2, val])
tm.assert_series_equal(result, expected)
ser.dropna(ignore_index=True, inplace=True)
tm.assert_series_equal(ser, expected)

View File

@ -0,0 +1,7 @@
import numpy as np
class TestSeriesDtypes:
def test_dtype(self, datetime_series):
assert datetime_series.dtype == np.dtype("float64")
assert datetime_series.dtypes == np.dtype("float64")

View File

@ -0,0 +1,77 @@
import numpy as np
import pytest
from pandas import (
NA,
Categorical,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True], name="name")),
        ("last", Series([True, True, False, False, False], name="name")),
        (False, Series([True, True, True, False, True], name="name")),
    ],
)
def test_duplicated_keep(keep, expected):
    """``duplicated`` honours ``keep`` and carries the Series name over."""
    letters = Series(["a", "b", "b", "c", "a"], name="name")
    tm.assert_series_equal(letters.duplicated(keep=keep), expected)
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """In object data, NaN and None are not treated as duplicates of each other."""
    mixed = Series([np.nan, 3, 3, None, np.nan], dtype=object)
    tm.assert_series_equal(mixed.duplicated(keep=keep), expected)
def test_duplicated_categorical_bool_na(nulls_fixture):
# GH#44351
ser = Series(
Categorical(
[True, False, True, False, nulls_fixture],
categories=[True, False],
ordered=True,
)
)
result = ser.duplicated()
expected = Series([False, False, True, True, False])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "keep, vals",
    [
        ("last", [True, True, False]),
        ("first", [False, True, True]),
        (False, [True, True, True]),
    ],
)
def test_duplicated_mask(keep, vals):
    """Repeated NA values in an Int64 Series are flagged according to ``keep``."""
    # GH#48150
    masked = Series([1, 2, NA, NA, NA], dtype="Int64")
    # the two leading non-NA values are unique; ``vals`` covers the NA tail
    expected = Series([False, False] + vals)
    tm.assert_series_equal(masked.duplicated(keep=keep), expected)
def test_duplicated_mask_no_duplicated_na(keep):
# GH#48150
ser = Series([1, 2, NA], dtype="Int64")
result = ser.duplicated(keep=keep)
expected = Series([False, False, False])
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,145 @@
from contextlib import nullcontext
import copy
import numpy as np
import pytest
from pandas._libs.missing import is_matching_na
from pandas.compat.numpy import np_version_gte1p25
from pandas.core.dtypes.common import is_float
from pandas import (
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "arr, idx",
    [
        ([1, 2, 3, 4], [0, 2, 1, 3]),
        ([1, np.nan, 3, np.nan], [0, 2, 1, 3]),
        (
            [1, np.nan, 3, np.nan],
            MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c"), (3, "c")]),
        ),
    ],
)
def test_equals(arr, idx):
    """A Series equals its copy until one of its values is changed."""
    original = Series(arr, index=idx)
    duplicate = original.copy()
    assert original.equals(duplicate)

    original[1] = 9
    assert not original.equals(duplicate)
@pytest.mark.parametrize(
"val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None]
)
def test_equals_list_array(val):
# GH20676 Verify equals operator for list of Numpy arrays
arr = np.array([1, 2])
s1 = Series([arr, arr])
s2 = s1.copy()
assert s1.equals(s2)
s1[1] = val
cm = (
tm.assert_produces_warning(FutureWarning, check_stacklevel=False)
if isinstance(val, str) and not np_version_gte1p25
else nullcontext()
)
with cm:
assert not s1.equals(s2)
def test_equals_false_negative():
    """Object Series built in different ways from ``[False, NaN]`` compare equal."""
    # GH8437 Verify false negative behavior of equals function for dtype object
    s1 = Series([False, np.nan])
    s2 = s1.copy()

    # four value-less object Series, each filled with False in its first slot
    s3 = Series(index=range(2), dtype=object)
    s4 = s3.copy()
    s5 = s3.copy()
    s6 = s3.copy()
    s3[:-1] = s4[:-1] = s5[0] = s6[0] = False

    for other in (s1, s2, s3, s4, s5):
        assert s1.equals(other)
    assert s5.equals(s6)
def test_equals_matching_nas():
# matching but not identical NAs
left = Series([np.datetime64("NaT")], dtype=object)
right = Series([np.datetime64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)
left = Series([np.timedelta64("NaT")], dtype=object)
right = Series([np.timedelta64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)
left = Series([np.float64("NaN")], dtype=object)
right = Series([np.float64("NaN")], dtype=object)
assert left.equals(right)
assert Index(left, dtype=left.dtype).equals(Index(right, dtype=right.dtype))
assert left.array.equals(right.array)
def test_equals_mismatched_nas(nulls_fixture, nulls_fixture2):
    """Compare single-element object Series built from two null scalars.

    The Series must be equal when the nulls match (``is_matching_na``) or when
    one is ``None`` and the other is a float; otherwise they must differ.
    """
    # GH#39650
    left = nulls_fixture
    # Work on a copy of the second null so the two are distinct objects
    # wherever copying produces one.
    right = (
        nulls_fixture2.copy()
        if hasattr(nulls_fixture2, "copy")
        else copy.copy(nulls_fixture2)
    )

    ser = Series([left], dtype=object)
    ser2 = Series([right], dtype=object)

    should_match = (
        is_matching_na(left, right)
        or (left is None and is_float(right))
        or (right is None and is_float(left))
    )
    if should_match:
        assert ser.equals(ser2)
    else:
        assert not ser.equals(ser2)
def test_equals_none_vs_nan():
    """``None`` and ``np.nan`` in the same slot of object data compare equal.

    Verified for Series, for Index objects built from them and for the
    underlying arrays.
    """
    # GH#39650
    with_none = Series([1, None], dtype=object)
    with_nan = Series([1, np.nan], dtype=object)

    assert with_none.equals(with_nan)

    none_index = Index(with_none, dtype=with_none.dtype)
    nan_index = Index(with_nan, dtype=with_nan.dtype)
    assert none_index.equals(nan_index)

    assert with_none.array.equals(with_nan.array)
def test_equals_None_vs_float():
    """An all-``None`` Series is not equal to a Series of assorted floats."""
    # GH#44190
    floats = Series(
        [-np.inf, np.nan, -1.0, 0.0, 1.0, 10 / 3, np.inf], dtype=object
    )
    nones = Series([None] * len(floats))

    # these series were found to be equal due to a bug, check that they are correctly
    # found to not equal
    # Both comparison directions are exercised for Series, DataFrame and Index.
    for first, second in ((floats, nones), (nones, floats)):
        assert not first.equals(second)
        assert not first.to_frame().equals(second.to_frame())
        assert not Index(first, dtype="object").equals(
            Index(second, dtype="object")
        )

View File

@ -0,0 +1,175 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_basic():
    """Explode list, tuple, NaN and empty-list entries, repeating index labels."""
    nested = pd.Series(
        [[0, 1, 2], np.nan, [], (3, 4)],
        index=list("abcd"),
        name="foo",
    )
    # Per the expected values, the NaN entry and the empty list each yield a
    # single NaN row under their original label.
    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=list("aaabcdd"),
        dtype=object,
        name="foo",
    )
    tm.assert_series_equal(nested.explode(), flattened)
def test_mixed_type():
    """Explode a Series mixing a list, NaN, None, an empty array and a Series."""
    entries = [[0, 1, 2], np.nan, None, np.array([]), pd.Series(["a", "b"])]
    exploded = pd.Series(entries, name="foo").explode()

    # Scalars (NaN, None) pass through; the empty ndarray becomes one NaN row.
    wanted_values = [0, 1, 2, np.nan, None, np.nan, "a", "b"]
    wanted_labels = [0, 0, 0, 1, 2, 3, 4, 4]
    wanted = pd.Series(
        wanted_values, index=wanted_labels, dtype=object, name="foo"
    )
    tm.assert_series_equal(exploded, wanted)
def test_empty():
    """Exploding an empty object-dtype Series yields an equal Series."""
    blank = pd.Series(dtype=object)
    tm.assert_series_equal(blank.explode(), blank.copy())
def test_nested_lists():
    """Only the outermost list level is unpacked; inner lists stay intact."""
    nested = pd.Series([[[1, 2, 3]], [1, 2], 1])
    one_level_down = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2])
    tm.assert_series_equal(nested.explode(), one_level_down)
def test_multi_index():
    """Explode keeps MultiIndex tuples and level names, repeating them per item."""
    level_names = ["foo", "bar"]
    source_index = pd.MultiIndex.from_product(
        [list("ab"), range(2)], names=level_names
    )
    nested = pd.Series(
        [[0, 1, 2], np.nan, [], (3, 4)], name="foo", index=source_index
    )
    exploded = nested.explode()

    # Each index tuple appears once per produced row.
    repeated_index = pd.MultiIndex.from_tuples(
        [("a", 0), ("a", 0), ("a", 0), ("a", 1), ("b", 0), ("b", 1), ("b", 1)],
        names=level_names,
    )
    wanted = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=repeated_index,
        dtype=object,
        name="foo",
    )
    tm.assert_series_equal(exploded, wanted)
def test_large():
    """Exploding an already exploded 256-element Series leaves it unchanged."""
    exploded_once = pd.Series([range(256)]).explode()
    exploded_twice = exploded_once.explode()
    tm.assert_series_equal(exploded_twice, exploded_once)
def test_invert_array():
    """Exploding row-wise arrays of a tz-aware column recovers the column."""
    frame = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")})
    # Wrap each row's values in an array so that every cell is list-like.
    row_arrays = frame.apply(lambda row: row.array, axis=1)
    exploded = row_arrays.explode()
    # rename() with no arguments drops the column name for the comparison.
    unnamed_column = frame["a"].rename()
    tm.assert_series_equal(exploded, unnamed_column)
@pytest.mark.parametrize(
    "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))]
)
def test_non_object_dtype(s):
    """Explode returns an equal Series for integer and tz-aware datetime input."""
    exploded = s.explode()
    tm.assert_series_equal(exploded, s)
def test_typical_usecase():
    """Split a delimited string column, explode it and join it back by index."""
    records = [{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]
    df = pd.DataFrame(records, columns=["var1", "var2"])

    letters = df.var1.str.split(",").explode()
    # Joining on the repeated index labels fans each var2 value out per letter.
    joined = df[["var2"]].join(letters)

    wanted = pd.DataFrame(
        {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")},
        columns=["var2", "var1"],
        index=[0, 0, 0, 1, 1, 1],
    )
    tm.assert_frame_equal(joined, wanted)
def test_nested_EA():
    """Explode a Series whose two cells are tz-aware date ranges."""
    # a nested EA array
    first_range = pd.date_range("20170101", periods=3, tz="UTC")
    second_range = pd.date_range("20170104", periods=3, tz="UTC")
    exploded = pd.Series([first_range, second_range]).explode()

    # The six consecutive days come back as one tz-aware Series, with each
    # original row label repeated three times.
    all_days = pd.date_range("20170101", periods=6, tz="UTC")
    wanted = pd.Series(all_days, index=[0, 0, 0, 1, 1, 1])
    tm.assert_series_equal(exploded, wanted)
def test_duplicate_index():
    """Explode works when the input index already holds duplicate labels."""
    # GH 28005
    nested = pd.Series([[1, 2], [3, 4]], index=[0, 0])
    wanted = pd.Series([1, 2, 3, 4], index=[0] * 4, dtype=object)
    tm.assert_series_equal(nested.explode(), wanted)
def test_ignore_index():
    """``ignore_index=True`` labels the exploded rows 0..n-1."""
    # GH 34932
    nested = pd.Series([[1, 2], [3, 4]])
    exploded = nested.explode(ignore_index=True)
    wanted = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object)
    tm.assert_series_equal(exploded, wanted)
def test_explode_sets():
    """A set-valued cell is exploded into one row per member."""
    # https://github.com/pandas-dev/pandas/issues/35614
    holder = pd.Series([{"a", "b", "c"}], index=[1])
    # The exploded rows are sorted before comparing, so the check does not
    # depend on the order in which the set's members come out.
    members = holder.explode().sort_values()
    wanted = pd.Series(["a", "b", "c"], index=[1, 1, 1])
    tm.assert_series_equal(members, wanted)
def test_explode_scalars_can_ignore_index():
    """``ignore_index=True`` resets the index even when all values are scalars."""
    # https://github.com/pandas-dev/pandas/issues/40487
    labelled = pd.Series([1, 2, 3], index=["a", "b", "c"])
    exploded = labelled.explode(ignore_index=True)
    # The string labels are replaced by the default index.
    tm.assert_series_equal(exploded, pd.Series([1, 2, 3]))
@pytest.mark.parametrize("ignore_index", [True, False])
def test_explode_pyarrow_list_type(ignore_index):
    """Explode a pyarrow ``list<int64>`` Series into a pyarrow ``int64`` Series.

    The test is skipped when pyarrow cannot be imported.
    """
    # GH 53602
    pa = pytest.importorskip("pyarrow")

    list_dtype = pd.ArrowDtype(pa.list_(pa.int64()))
    ser = pd.Series(
        [
            [None, None],
            [1],
            [],
            [2, 3],
            None,
        ],
        dtype=list_dtype,
    )
    exploded = ser.explode(ignore_index=ignore_index)

    # With ignore_index the default index is expected; otherwise each source
    # row label is repeated once per produced row.
    if ignore_index:
        wanted_index = None
    else:
        wanted_index = [0, 0, 1, 2, 3, 3, 4]
    wanted = pd.Series(
        data=[None, None, 1, None, 2, 3, None],
        index=wanted_index,
        dtype=pd.ArrowDtype(pa.int64()),
    )
    tm.assert_series_equal(exploded, wanted)
@pytest.mark.parametrize("ignore_index", [True, False])
def test_explode_pyarrow_non_list_type(ignore_index):
    """Exploding a pyarrow ``int64`` Series returns the same values and index.

    The test is skipped when pyarrow cannot be imported.
    """
    pa = pytest.importorskip("pyarrow")

    int_dtype = pd.ArrowDtype(pa.int64())
    exploded = pd.Series([1, 2, 3], dtype=int_dtype).explode(
        ignore_index=ignore_index
    )
    # The same expectation is used for both values of ignore_index.
    wanted = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2])
    tm.assert_series_equal(exploded, wanted)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,38 @@
from pandas import (
Index,
Series,
date_range,
)
import pandas._testing as tm
class TestGetNumericData:
    """Checks for the private ``Series._get_numeric_data`` method."""

    def test_get_numeric_data_preserve_dtype(
        self, using_copy_on_write, warn_copy_on_write
    ):
        """Integer and boolean Series come back as-is; object and datetime
        Series come back empty with their dtype kept."""
        # get the numeric data
        ints = Series([1, 2, 3])
        numeric = ints._get_numeric_data()
        tm.assert_series_equal(numeric, ints)

        # returned object is a shallow copy
        with tm.assert_cow_warning(warn_copy_on_write):
            numeric.iloc[0] = 0
        # Under copy-on-write the source keeps its 1; otherwise the write to
        # the result is visible in the source as 0.
        first_value_after_write = 1 if using_copy_on_write else 0
        assert ints.iloc[0] == first_value_after_write

        empty_object_index = Index([], dtype=object)

        mixed = Series([1, "2", 3.0])
        tm.assert_series_equal(
            mixed._get_numeric_data(),
            Series([], dtype=object, index=empty_object_index),
        )

        flags = Series([True, False, True])
        tm.assert_series_equal(flags._get_numeric_data(), flags)

        stamps = Series(date_range("20130101", periods=3))
        tm.assert_series_equal(
            stamps._get_numeric_data(),
            Series([], dtype="M8[ns]", index=Index([], dtype=object)),
        )

View File

@ -0,0 +1,8 @@
import pandas._testing as tm
def test_head_tail(string_series):
    """``head``/``tail`` match the equivalent slices, including for ``n=0``."""
    checks = (
        (string_series.head(), string_series[:5]),
        (string_series.head(0), string_series[0:0]),
        (string_series.tail(), string_series[-5:]),
        (string_series.tail(0), string_series[0:0]),
    )
    for actual, wanted in checks:
        tm.assert_series_equal(actual, wanted)

Some files were not shown because too many files have changed in this diff Show More