Description
Currently, Pandas IO tools for binary files support largely the commercial statistical packages (SAS, Stata, SPSS). Interestingly, R binary types (.rds, .rda) are not included. Since many data science teams work between the open source stacks, some IO pandas support of R data files may be worthwhile to pursue.
I know there is some history of pandas with rpy2. However, there may be a way to integrate an IO module for R data files without optional dependency (i.e, pyreadr) but using a lightweight C library: librdata. Also, R's saveRDS
uses compression types (gzip
, bzip2
, and xz
) already handled with pandas io.
Thanks to the authors of pyreadr and librdata (not unlike the sas7bdat
authors for read_sas
or PyDTA
authors for read_stata
), I was able to implement a demo on an uncompressed rds type.
R
set.seed(332020)
alpha <- c(LETTERS, letters, c(0:9))
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
random_df <- data.frame(
group = factor(sample(data_tools, 500, replace=TRUE)),
int = sample(1:15, 500, replace=TRUE),
num = rnorm(500),
char = replicate(500, paste(sample(alpha, 3, replace=TRUE), collapse="")),
bool = sample(c(TRUE, FALSE), 500, replace=TRUE),
date = sample(seq(as.Date("2001-01-01"), Sys.Date(), by="days"), 500, replace=TRUE),
ts= as.POSIXct(sample(1577836800:as.integer(Sys.time()), 500, replace=TRUE), origin="1970-01-01")
)
head(random_df, 5)
# group int num char bool date ts
# 1 r 15 1.2012638 3kp TRUE 2006-12-07 2020-07-04 21:06:23
# 2 spss 5 0.9627570 rf6 TRUE 2010-10-28 2020-03-22 02:04:45
# 3 julia 9 -0.7922929 t9Q FALSE 2003-01-02 2021-02-11 06:33:02
# 4 julia 14 -0.4305794 zWw TRUE 2004-11-24 2020-01-25 11:24:20
# 5 python 5 -1.0262956 LqK TRUE 2020-08-20 2021-01-15 23:22:05
tail(random_df, 5)
# group int num char bool date ts
# 496 python 2 1.1038483 xGh TRUE 2014-01-14 2020-11-30 11:30:00
# 497 sas 1 -0.5588906 BbC TRUE 2011-03-15 2020-11-03 06:30:08
# 498 sas 1 0.3989181 FHi TRUE 2003-03-02 2020-01-16 08:23:56
# 499 python 1 -0.7840641 e3k FALSE 2001-10-23 2020-05-27 06:53:21
# 500 spss 7 0.2351526 Klv TRUE 2002-05-31 2020-05-28 11:42:56
### RDS
saveRDS(random_df, "/path/to/r_df.rds", compress=FALSE)
### RDA
save(random_df, "/path/to/r_df.rda", compress=FALSE)
Python (using a Cython built module)
Parser
from rparser import Parser, Writer # slight adjustments to pyreadr's pyx w/o 3rd party imports
import pandas as pd
from datetime import datetime as dt, timedelta
class BaseRParser(Parser):
"""
Parses the RData or Rds file using the parser defined
in librdata.pyx which in turn uses the C API of librdata.
"""
def __init__(self):
self.counter = 0
self.is_factor = False
self.col_names = {}
self.row_names = {}
self.col_types = {}
self.col_data = {}
self.text_vals = {}
self.value_labels = {}
def handle_table(self, name):
"""
Every object in the file is called table, this method is evoked once per object.
:param name: str: the name of the table
"""
pass
def handle_column(self, name, data_type, data, count):
"""
Evoked once per each column in the table.
:param name: str: column name, may be None
:param data_type: object of type DataType(Enum) (defined in librdata.pyx)
:param data: a dictionary containing the data in R vector, may be empty
:param count: int: number of elements in the array
"""
self.row_count = count
if self.is_factor:
data = {k:self.value_labels[v] for k,v in data.items()}
self.is_factor = False
if data_type == "bool":
data = {k:True if v==1 else False for k,v in data.items()}
if data_type == "date":
data = {k:dt(1970,1,1,0,0) + timedelta(v) for k,v in data.items()}
if data_type == "datetime":
data = {k:dt.fromtimestamp(v) for k,v in data.items()}
self.col_data = {**self.col_data, **{self.counter: data}}
self.col_types[self.counter] = data_type
self.counter += 1
def handle_column_name(self, name, index):
"""
Some times name is None in handle column but it is recovered with this method.
:param name: str: name of the column
:param index: int: index of the column
"""
self.col_names[index] = name
if index == (self.counter - 1):
self.compile_dataframe()
def handle_dim(self, name, data_type, data, count):
"""
Evoked once to retrieve the number of dimensions
:param name: str: column name, may be None
:param data_type: object of type DataType(Enum) (defined in librdata.pyx)
:param data: a numpy array representing the number of dimensions
:param count: int: number of elements in the array
"""
pass
def handle_dim_name(self, name, index):
"""
Get one dimension name, one at a time, for matrices, arrays, tables.
:param name: str: name of the dimension
:param index: int: index of the dimension
"""
pass
def handle_row_name(self, name, index):
"""
Handles R dataframe's rownames
:param name: str: name of the row
:param index: int: index of the row
"""
self.row_names[index] = name
def handle_text_value(self, name, index):
"""
For character vectors this will be called once per row and will
retrieve the string value for that row.
:param name: str: string value for the row
:param index: int: index of the row
:return:
"""
self.text_vals[index] = name
if index == (self.row_count - 1):
self.col_data = {**self.col_data, **{self.counter-1: self.text_vals}}
self.text_vals = {}
def handle_value_label(self, name, index):
"""
Factors are represented as integer vectors.
For factors, this method is called before reading the integer data
in the Factor column with handle_column and will give all the
string values corresponding to the integer values.
:param name: str: string value
:param index: int: integer value
:return:
"""
self.is_factor = True
self.value_labels[index+1] = name
def compile_dataframe(self):
df_data = {n:v for (k,v),n in zip(self.col_data.items(), list(self.col_names.values()))}
self.r_dataframe = pd.DataFrame(df_data)
parser = BaseRParser()
parser.read_rds("/path/to/r_df.rds")
parser.read_rds("/path/to/r_df.rda")
py_df = parser.r_dataframe
print(py_df)
# group int num char bool date ts
# 0 r 15 1.201264 3kp True 2006-12-07 2020-07-04 21:06:23
# 1 spss 5 0.962757 rf6 True 2010-10-28 2020-03-22 02:04:45
# 2 julia 9 -0.792293 t9Q False 2003-01-02 2021-02-11 06:33:02
# 3 julia 14 -0.430579 zWw True 2004-11-24 2020-01-25 11:24:20
# 4 python 5 -1.026296 LqK True 2020-08-20 2021-01-15 23:22:05
# .. ... ... ... ... ... ... ...
# 495 python 2 1.103848 xGh True 2014-01-14 2020-11-30 11:30:00
# 496 sas 1 -0.558891 BbC True 2011-03-15 2020-11-03 06:30:08
# 497 sas 1 0.398918 FHi True 2003-03-02 2020-01-16 08:23:56
# 498 python 1 -0.784064 e3k False 2001-10-23 2020-05-27 06:53:21
# 499 spss 7 0.235153 Klv True 2002-05-31 2020-05-28 11:42:56
# [500 rows x 7 columns]
Writer
class BaseRWriter(Writer):
"""
Writes pandas data to the RData or Rds file using the writer defined
in librdata.pyx which in turn uses the C API of librdata.
"""
def write_rds(self, path=None, file_format="rds", frame=None, frame_name=None):
"""
write a RData or Rds file.
path: str: path to the file
file_format: str: rdata or rds
df: pandas data frame
df_name = name of the object to write. Irrelevant if rds format.
"""
py_to_r_types = {
"int64": "INTEGER",
"float64": "REAL",
"bool": "LOGICAL",
"object": "CHARACTER",
"datetime64[ns]": "TIMESTAMP"
}
self.open(path, file_format)
self.set_row_count(frame.shape[0])
self.set_table_name(frame_name)
for col_name, col_type in zip(frame.columns.tolist(), frame.dtypes.tolist()):
cur_type = py_to_r_types[str(col_type)]
self.add_column(str(col_name), str(cur_type))
for col in frame.select_dtypes(include=['datetime']).columns:
frame[col] = frame[col].dt.tz_localize("America/Chicago").astype(int).div(10**9)
for col_indx,((col,ser),dtype) in enumerate(zip(frame.to_dict().items(), frame.dtypes.tolist())):
cur_type = py_to_r_types[str(dtype)]
for row_indx, (indx, val) in enumerate(ser.items()):
self.insert_value(row_indx, col_indx, val, cur_type)
self.close()
writer = BaseRWriter()
writer.write_rds(
path="/path/to/py_df.rds",
file_format="rds",
frame=py_df,
frame_name="pandas_df"
)
R
py_df <- readRDS("/path/to//py_df.rds")
str(py_df)
# 'data.frame': 500 obs. of 7 variables:
# $ group: chr "r" "spss" "julia" "julia" ...
# $ int : int 15 5 9 14 5 9 7 7 5 10 ...
# $ num : num 1.201 0.963 -0.792 -0.431 -1.026 ...
# $ char : chr "3kp" "rf6" "t9Q" "zWw" ...
# $ bool : logi TRUE TRUE FALSE TRUE TRUE TRUE ...
# $ date : 'POSIXct' num 2006-12-07 2010-10-28 2003-01-02 2004-11-24 2020-08-20 ...
# $ ts : 'POSIXct' num 2020-07-04 21:06:23 2020-03-22 02:04:45 2021-02-11 06:33:02 2020-01-25 11:24:20 2021-01-15 23:22:05 ...
# - attr(*, "datalabel")= chr "pandas_df"
# - attr(*, "var.labels")= chr [1:7] "" "" "" "" ...
head(py_df, 5)
# group int num char bool date ts
# 1 r 15 1.2012638 3kp TRUE 2006-12-07 2020-07-04 21:06:23
# 2 spss 5 0.9627570 rf6 TRUE 2010-10-28 2020-03-22 02:04:45
# 3 julia 9 -0.7922929 t9Q FALSE 2003-01-02 2021-02-11 06:33:02
# 4 julia 14 -0.4305794 zWw TRUE 2004-11-24 2020-01-25 11:24:20
# 5 python 5 -1.0262956 LqK TRUE 2020-08-20 2021-01-15 23:22:05
tail(py_df, 5)
# group int num char bool date ts
# 496 python 2 1.1038483 xGh TRUE 2014-01-14 2020-11-30 11:30:00
# 497 sas 1 -0.5588906 BbC TRUE 2011-03-15 2020-11-03 06:30:08
# 498 sas 1 0.3989181 FHi TRUE 2003-03-02 2020-01-16 08:23:56
# 499 python 1 -0.7840641 e3k FALSE 2001-10-23 2020-05-27 06:53:21
# 500 spss 7 0.2351526 Klv TRUE 2002-05-31 2020-05-28 11:42:56