Skip to content

Commit fcd11b5

Browse files
committed
CLN: str.cat internals
1 parent 8a1c8ad commit fcd11b5

File tree

2 files changed

+53
-161
lines changed

2 files changed

+53
-161
lines changed

pandas/core/strings.py

Lines changed: 52 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33

44
from pandas.compat import zip
55
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
6-
from pandas.core.dtypes.missing import isna, notna
6+
from pandas.core.dtypes.missing import isna
77
from pandas.core.dtypes.common import (
88
is_bool_dtype,
99
is_categorical_dtype,
@@ -36,114 +36,28 @@
3636
_shared_docs = dict()
3737

3838

39-
def _get_array_list(arr, others):
40-
"""
41-
Auxiliary function for :func:`str_cat`
42-
43-
Parameters
44-
----------
45-
arr : ndarray
46-
The left-most ndarray of the concatenation
47-
others : list, ndarray, Series
48-
The rest of the content to concatenate. If list of list-likes,
49-
all elements must be passable to ``np.asarray``.
50-
51-
Returns
52-
-------
53-
list
54-
List of all necessary arrays
55-
"""
56-
from pandas.core.series import Series
57-
58-
if len(others) and isinstance(com.values_from_object(others)[0],
59-
(list, np.ndarray, Series)):
60-
arrays = [arr] + list(others)
61-
else:
62-
arrays = [arr, others]
63-
64-
return [np.asarray(x, dtype=object) for x in arrays]
65-
66-
67-
def str_cat(arr, others=None, sep=None, na_rep=None):
68-
"""
39+
def interleave_sep(all_cols, sep):
40+
'''
6941
Auxiliary function for :meth:`str.cat`
7042
71-
If `others` is specified, this function concatenates the Series/Index
72-
and elements of `others` element-wise.
73-
If `others` is not being passed then all values in the Series are
74-
concatenated in a single string with a given `sep`.
75-
7643
Parameters
7744
----------
78-
others : list-like, or list of list-likes, optional
79-
List-likes (or a list of them) of the same length as calling object.
80-
If None, returns str concatenating strings of the Series.
81-
sep : string or None, default None
82-
If None, concatenates without any separator.
83-
na_rep : string or None, default None
84-
If None, NA in the series are ignored.
45+
all_cols : list of numpy arrays
46+
List of arrays to be concatenated with sep
47+
sep : string
48+
The separator string for concatenating the columns
8549
8650
Returns
8751
-------
88-
concat
89-
ndarray containing concatenated results (if `others is not None`)
90-
or str (if `others is None`)
91-
"""
92-
if sep is None:
93-
sep = ''
94-
95-
if others is not None:
96-
arrays = _get_array_list(arr, others)
97-
98-
n = _length_check(arrays)
99-
masks = np.array([isna(x) for x in arrays])
100-
cats = None
101-
102-
if na_rep is None:
103-
na_mask = np.logical_or.reduce(masks, axis=0)
104-
105-
result = np.empty(n, dtype=object)
106-
np.putmask(result, na_mask, np.nan)
107-
108-
notmask = ~na_mask
109-
110-
tuples = zip(*[x[notmask] for x in arrays])
111-
cats = [sep.join(tup) for tup in tuples]
112-
113-
result[notmask] = cats
114-
else:
115-
for i, x in enumerate(arrays):
116-
x = np.where(masks[i], na_rep, x)
117-
if cats is None:
118-
cats = x
119-
else:
120-
cats = cats + sep + x
121-
122-
result = cats
123-
124-
return result
125-
else:
126-
arr = np.asarray(arr, dtype=object)
127-
mask = isna(arr)
128-
if na_rep is None and mask.any():
129-
if sep == '':
130-
na_rep = ''
131-
else:
132-
return sep.join(arr[notna(arr)])
133-
return sep.join(np.where(mask, na_rep, arr))
134-
135-
136-
def _length_check(others):
137-
n = None
138-
for x in others:
139-
try:
140-
if n is None:
141-
n = len(x)
142-
elif len(x) != n:
143-
raise ValueError('All arrays must be same length')
144-
except TypeError:
145-
raise ValueError('Must pass arrays containing strings to str_cat')
146-
return n
52+
list
53+
The list of arrays interleaved with sep; to be fed to np.sum
54+
'''
55+
if sep == '':
56+
# no need to add empty strings
57+
return all_cols
58+
result = [sep] * (2 * len(all_cols) - 1)
59+
result[::2] = all_cols
60+
return result
14761

14862

14963
def _na_map(f, arr, na_result=np.nan, dtype=object):
@@ -2172,6 +2086,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21722086

21732087
if isinstance(others, compat.string_types):
21742088
raise ValueError("Did you mean to supply a `sep` keyword?")
2089+
if sep is None:
2090+
sep = ''
21752091

21762092
if isinstance(self._orig, Index):
21772093
data = Series(self._orig, index=self._orig)
@@ -2180,9 +2096,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21802096

21812097
# concatenate Series/Index with itself if no "others"
21822098
if others is None:
2183-
result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2184-
return self._wrap_result(result,
2185-
use_codes=(not self._is_categorical))
2099+
data = data.astype(object).values
2100+
mask = isna(data)
2101+
if mask.any():
2102+
if na_rep is None:
2103+
return sep.join(data[~mask])
2104+
return sep.join(np.where(mask, na_rep, data))
2105+
return sep.join(data)
21862106

21872107
try:
21882108
# turn anything in "others" into lists of Series
@@ -2209,23 +2129,42 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
22092129
"'outer'|'inner'|'right'`. The future default will "
22102130
"be `join='left'`.", FutureWarning, stacklevel=2)
22112131

2212-
# align if required
2213-
if join is not None:
2132+
# if join is None, _get_series_list already aligned indexes
2133+
join = 'left' if join is None else join
2134+
2135+
if any(not data.index.equals(x.index) for x in others):
22142136
# Need to add keys for uniqueness in case of duplicate columns
22152137
others = concat(others, axis=1,
22162138
join=(join if join == 'inner' else 'outer'),
2217-
keys=range(len(others)))
2139+
keys=range(len(others)), copy=False)
22182140
data, others = data.align(others, join=join)
22192141
others = [others[x] for x in others] # again list of Series
22202142

2221-
# str_cat discards index
2222-
res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2143+
all_cols = [x.astype(object).values for x in [data] + others]
2144+
masks = np.array([isna(x) for x in all_cols])
2145+
union_mask = np.logical_or.reduce(masks, axis=0)
2146+
2147+
if na_rep is None and union_mask.any():
2148+
result = np.empty(len(data), dtype=object)
2149+
np.putmask(result, union_mask, np.nan)
2150+
2151+
not_masked = ~union_mask
2152+
all_cols = interleave_sep([x[not_masked] for x in all_cols], sep)
2153+
2154+
result[not_masked] = np.sum(all_cols, axis=0)
2155+
elif na_rep is not None and union_mask.any():
2156+
# fill NaNs
2157+
all_cols = [np.where(masks[i], na_rep, all_cols[i])
2158+
for i in range(len(all_cols))]
2159+
result = np.sum(interleave_sep(all_cols, sep), axis=0)
2160+
else: # no NaNs
2161+
result = np.sum(interleave_sep(all_cols, sep), axis=0)
22232162

22242163
if isinstance(self._orig, Index):
2225-
res = Index(res, name=self._orig.name)
2164+
result = Index(result, name=self._orig.name)
22262165
else: # Series
2227-
res = Series(res, index=data.index, name=self._orig.name)
2228-
return res
2166+
result = Series(result, index=data.index, name=self._orig.name)
2167+
return result
22292168

22302169
_shared_docs['str_split'] = ("""
22312170
Split strings around given separator/delimiter.

pandas/tests/test_strings.py

Lines changed: 1 addition & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
9797
assert i == 100
9898
assert s == 'h'
9999

100-
def test_cat(self):
101-
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
102-
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)
103-
104-
# single array
105-
result = strings.str_cat(one)
106-
exp = 'aabbc'
107-
assert result == exp
108-
109-
result = strings.str_cat(one, na_rep='NA')
110-
exp = 'aabbcNA'
111-
assert result == exp
112-
113-
result = strings.str_cat(one, na_rep='-')
114-
exp = 'aabbc-'
115-
assert result == exp
116-
117-
result = strings.str_cat(one, sep='_', na_rep='NA')
118-
exp = 'a_a_b_b_c_NA'
119-
assert result == exp
120-
121-
result = strings.str_cat(two, sep='-')
122-
exp = 'a-b-d-foo'
123-
assert result == exp
124-
125-
# Multiple arrays
126-
result = strings.str_cat(one, [two], na_rep='NA')
127-
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
128-
dtype=np.object_)
129-
tm.assert_numpy_array_equal(result, exp)
130-
131-
result = strings.str_cat(one, two)
132-
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
133-
tm.assert_almost_equal(result, exp)
134-
135-
# error for incorrect lengths
136-
rgx = 'All arrays must be same length'
137-
three = Series(['1', '2', '3'])
138-
139-
with tm.assert_raises_regex(ValueError, rgx):
140-
strings.str_cat(one, three)
141-
142-
# error for incorrect type
143-
rgx = "Must pass arrays containing strings to str_cat"
144-
with tm.assert_raises_regex(ValueError, rgx):
145-
strings.str_cat(one, 'three')
146-
147100
@pytest.mark.parametrize('box', [Series, Index])
148101
@pytest.mark.parametrize('other', [None, Series, Index])
149102
def test_str_cat_name(self, box, other):
@@ -3136,7 +3089,7 @@ def test_method_on_bytes(self):
31363089
lhs = Series(np.array(list('abc'), 'S1').astype(object))
31373090
rhs = Series(np.array(list('def'), 'S1').astype(object))
31383091
if compat.PY3:
3139-
pytest.raises(TypeError, lhs.str.cat, rhs)
3092+
pytest.raises(TypeError, lhs.str.cat, rhs, sep=',')
31403093
else:
31413094
result = lhs.str.cat(rhs)
31423095
expected = Series(np.array(

0 commit comments

Comments
 (0)