Description
Hi,
I want to convert a Series that contains datetimes as strings to datetime64. The format is '%Y-%m-%d %H:%M:%S' (for instance, '2012-07-06 10:05:58').
Casting the string array to a datetime64 array in numpy (or using Series.astype("datetime64")) is fast, but it shifts the datetimes according to the local timezone, which is not the behavior I want in this case. Pandas' to_datetime function parses the strings correctly, but it is much slower.
However, it is also possible to parse the strings both correctly and quickly with numpy by appending the "+0000" timezone suffix to every string before parsing/casting to datetime64. So I wonder: is there any reason why to_datetime() runs so much slower than this approach?
Thanks and regards.
Some sample code to illustrate the issue:
import os
import time
import numpy as np
from datetime import datetime, timedelta
from pandas import DatetimeIndex, Series, to_datetime
from pandas.tseries.offsets import Minute
# strftime pattern used to render the generated test timestamps as strings,
# e.g. '2012-07-06 10:05:58'.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
# Decorator to execute code in a faked GMT timezone
def GMT_Timezone(func):
    """Return a wrapper that calls ``func`` with the process timezone set to GMT.

    The wrapper sets the ``TZ`` environment variable to ``'GMT'``, calls
    ``time.tzset()``, runs ``func`` with the given arguments and returns its
    result.  Afterwards ``TZ`` is put back exactly as it was found (removed
    again if it was not set) and ``time.tzset()`` is called once more, even
    when ``func`` raises.

    Note that this changes process-wide state for the duration of the call.
    """
    # Local import so the block does not depend on a new file-level import.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Remember the real TZ setting rather than time.tzname[0]: the latter
        # is only an abbreviation of the current zone, and writing it back
        # would leave TZ set even when it was originally absent.
        previous_tz = os.environ.get('TZ')
        # Set timezone to GMT
        os.environ['TZ'] = 'GMT'
        time.tzset()
        try:
            # Execute function
            return func(*args, **kwargs)
        finally:
            # Set timezone to previous state, also when func raised
            if previous_tz is None:
                os.environ.pop('TZ', None)
            else:
                os.environ['TZ'] = previous_tz
            time.tzset()
    return wrapper
def generate_test_data():
    """Build the benchmark input: one formatted timestamp string per minute.

    Returns a Series of strings, formatted with DATETIME_FORMAT, covering
    every minute from 2012-01-01 00:00:00 through 2012-01-31 00:00:00.
    """
    # date_range is imported locally because the file-level import only
    # provides DatetimeIndex, whose start/end/freq constructor form is not
    # accepted by later pandas releases; date_range builds the same index.
    from pandas import date_range

    start_date = datetime(2012, 1, 1)
    end_date = datetime(2012, 1, 31)
    minutes = date_range(start=start_date, end=end_date, freq=Minute())
    # Render every timestamp as text so the parsers under test start from
    # plain strings.
    str_minutes = Series(minutes.map(lambda d: d.strftime(DATETIME_FORMAT)))
    return str_minutes
def simple_casting(data):
    """Cast ``data`` directly to ``datetime64`` through its ``astype`` method."""
    target_dtype = "datetime64"
    return data.astype(target_dtype)
def shifting_afterwards(data):
    """Cast ``data`` to ``datetime64`` and then shift it by ``time.altzone``.

    The cast values are wrapped in a DatetimeIndex, ``time.altzone`` seconds
    are subtracted from every entry, and the result is returned as a Series.
    """
    as_index = DatetimeIndex(data.astype("datetime64"))
    offset = timedelta(seconds=time.altzone)
    return Series(as_index - offset)
def concat_gmt_tz(data):
    """Append the ``+0000`` offset to every string, then cast to ``datetime64``."""
    gmt_suffix = "+0000"
    with_offset = np.add(data, gmt_suffix)
    return with_offset.astype("datetime64")
def using_to_datetime(data):
    """Parse ``data`` with pandas' ``to_datetime`` using its default arguments."""
    parsed = to_datetime(data)
    return parsed
@GMT_Timezone
def faking_tz(data):
    """Cast ``data`` to ``datetime64`` while the GMT_Timezone decorator is active."""
    target_dtype = "datetime64"
    return data.astype(target_dtype)
if __name__ == '__main__':
    # Script entry point: first check which parsing approaches agree with
    # to_datetime(), then time the ones that do.
    test_data = generate_test_data()

    # Some approaches to parse the datetime string
    using_to_dt = using_to_datetime(test_data)
    simple = simple_casting(test_data)
    shifted = shifting_afterwards(test_data)
    faked_tz = faking_tz(test_data)
    concat_tz = concat_gmt_tz(test_data)

    # Simple casting does not work. During the process, numpy converts the date
    # into GMT according to the local timezone.
    assert not (simple == using_to_dt).all(), "Fails if run from GMT timezone."

    # Shifting after casting does not work either, the problem being that
    # we should shift using a different timezone depending on the date we are
    # parsing (summer vs winter) -> more complex + error-prone + slower
    assert not (shifted == using_to_dt).all()

    # Faking the timezone to be GMT avoids datetime transformations, so it seems
    # to work, although since we are messing with environmental variables
    # it gets risky (multithreading, other side-effects?)
    assert (faked_tz == using_to_dt).all()
    assert (concat_tz == using_to_dt).all()

    # Now to performance
    from timeit import Timer
    ITERATIONS = 5

    def _mean_runtime(func_name):
        # Average seconds per call of func_name(test_data) over ITERATIONS
        # runs; the setup statement regenerates the input so that building
        # the test data is not part of the measurement.
        timer = Timer(
            "%s(test_data)" % func_name,
            "from __main__ import generate_test_data, %s;" % func_name +
            "test_data = generate_test_data()")
        return timer.timeit(ITERATIONS) / ITERATIONS

    t_using_to_dt = _mean_runtime("using_to_datetime")
    t_faking_tz = _mean_runtime("faking_tz")
    t_concat_tz = _mean_runtime("concat_gmt_tz")

    # print(...) with a single pre-formatted string gives the same output on
    # Python 2 and Python 3, unlike the print statement.
    print("to_datetime(): %s" % t_using_to_dt)
    print("faking tz: %s" % t_faking_tz)
    # to_datetime() ~90 times slower than .astype("datetime64")
    print("Ratio: %s" % (t_using_to_dt / t_faking_tz))
    print("")
    print("to_datetime(): %s" % t_using_to_dt)
    print("concat tz: %s" % t_concat_tz)
    # to_datetime() ~270 times slower than concat tz + .astype("datetime64")
    print("Ratio: %s" % (t_using_to_dt / t_concat_tz))