ENH: to_sql() add parameter "method" to control insertions method (#8… #21401
Changes from 2 commits

pandas/core/generic.py
@@ -2014,7 +2014,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
                                  **kwargs)

    def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
               index_label=None, chunksize=None, dtype=None):
               index_label=None, chunksize=None, dtype=None, method='default'):
        """
        Write records stored in a DataFrame to a SQL database.
@@ -2052,6 +2052,8 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
            Specifying the datatype for columns. The keys should be the column
            names and the values should be the SQLAlchemy types or strings for
            the sqlite3 legacy mode.
        method : {'default', 'multi', callable}, default 'default'
            Controls the SQL insertion clause used.

        Raises
        ------
@@ -2120,11 +2122,59 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
        >>> engine.execute("SELECT * FROM integers").fetchall()
        [(1,), (None,), (2,)]

        Insertion method:

        .. versionadded:: 0.24.0

        The parameter ``method`` controls the SQL insertion clause used.
        Possible values are:
        - `'default'`: Uses standard SQL `INSERT` clause.

Review comment: Do we need to say this is a single insert statement for each row?
        - `'multi'`: Pass multiple values in a single `INSERT` clause.
          It uses a **special** SQL syntax not supported by all backends.

Review comment: The 'It' should be indented equally to the

          This usually provides a big performance improvement for analytic
          databases like *Presto* and *Redshift*, but has worse performance for
          traditional SQL backends if the table contains many columns.
          For more information check the SQLAlchemy `documentation <http://docs.sqlalchemy.org/en/latest/core/dml.html?highlight=multivalues#sqlalchemy.sql.expression.Insert.values.params.*args>`__.

Review comment: If you start the url on the next line, I think flake8 will be OK with the too long line, like:
        - callable: with signature `(pd_table, conn, keys, data_iter)`.
          This can be used to implement more performant insertion based on

Review comment: same here about the indentation

          specific backend dialect features.
          E.g. using *PostgreSQL* `COPY clause
          <https://www.postgresql.org/docs/current/static/sql-copy.html>`__.
          Check the API for details and a sample implementation in
          :func:`~pandas.DataFrame.to_sql`.

        Example of callable for PostgreSQL *COPY*::

            # Alternative to_sql() *method* for DBs that support COPY FROM
            import csv
            from io import StringIO

            def psql_insert_copy(table, conn, keys, data_iter):
                # gets a DBAPI connection that can provide a cursor
                dbapi_conn = conn.connection
                with dbapi_conn.cursor() as cur:
                    s_buf = StringIO()
                    writer = csv.writer(s_buf)
                    writer.writerows(data_iter)
                    s_buf.seek(0)

                    columns = ', '.join('"{}"'.format(k) for k in keys)
                    if table.schema:
                        table_name = '{}.{}'.format(table.schema, table.name)
                    else:
                        table_name = table.name

                    sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
                        table_name, columns)
                    cur.copy_expert(sql=sql, file=s_buf)

Review comment: Nice explanation here! Do you mind moving some of it to the parameter description? That's probably where viewers will look first.

Review comment: Yep, indeed (similar to my comment above), I would at least put a basic explanation of each option in the parameter description.

Review comment: And maybe we can put this example in the user guide (io.rst)?
""" | ||
from pandas.io import sql | ||
sql.to_sql(self, name, con, schema=schema, if_exists=if_exists, | ||
index=index, index_label=index_label, chunksize=chunksize, | ||
dtype=dtype) | ||
dtype=dtype, method=method) | ||
|
||
def to_pickle(self, path, compression='infer', | ||
protocol=pkl.HIGHEST_PROTOCOL): | ||
|
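
A small usage sketch (not part of the diff) of how the new ``method`` argument would be passed to ``DataFrame.to_sql``; the in-memory SQLite engine, the table name 'demo' and the DataFrame contents are made up for illustration, and the callable variant additionally needs the ``psql_insert_copy`` helper from the docstring example above plus a PostgreSQL connection::

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('sqlite://')  # assumed in-memory database
    df = pd.DataFrame({'int_column': [1, 2, 3], 'str_column': ['a', 'b', 'c']})

    # default behaviour: one INSERT statement executed per row (executemany)
    df.to_sql('demo', engine, index=False)

    # 'multi': a single multi-values INSERT per chunk; the backend must
    # support the INSERT ... VALUES (...), (...) syntax
    df.to_sql('demo', engine, index=False, if_exists='replace', method='multi')

    # callable: e.g. the psql_insert_copy helper shown above, which needs a
    # PostgreSQL engine (psycopg2's cursor.copy_expert)
    # df.to_sql('demo', pg_engine, index=False, method=psql_insert_copy)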

pandas/io/sql.py
@@ -6,6 +6,7 @@
from __future__ import print_function, division
from datetime import datetime, date, time
from functools import partial

import warnings
import re
@@ -398,7 +399,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,


def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
           index_label=None, chunksize=None, dtype=None):
           index_label=None, chunksize=None, dtype=None, method='default'):
""" | ||
Write records stored in a DataFrame to a SQL database. | ||
|
||
|
@@ -432,6 +433,8 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
        Optional specifying the datatype for columns. The SQL type should
        be a SQLAlchemy type, or a string for sqlite3 fallback connection.
        If all columns are of the same type, one single value can be used.
    method : {'default', 'multi', callable}, default 'default'
        Controls the SQL insertion clause used.

    """
    if if_exists not in ('fail', 'replace', 'append'):
@@ -447,7 +450,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
    pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
                      index_label=index_label, schema=schema,
                      chunksize=chunksize, dtype=dtype)
                      chunksize=chunksize, dtype=dtype, method=method)


def has_table(table_name, con, schema=None):
@@ -572,8 +575,29 @@ def create(self):
        else:
            self._execute_create()

    def insert_statement(self):
        return self.table.insert()
    def _execute_insert(self, conn, keys, data_iter):
        """Execute SQL statement inserting data

        Parameters
        ----------
        conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
        keys : list of str
            Column names
        data_iter : generator of list
            Each item contains a list of values to be inserted
        """
        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
        conn.execute(self.table.insert(), data)

    def _execute_insert_multi(self, conn, keys, data_iter):
        """Alternative to _execute_insert for DBs that support multivalue INSERT.

        Note: multi-value insert is usually faster for analytics DBs
        and tables containing a few columns,
        but performance degrades quickly as the number of columns increases.
        """
        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
        conn.execute(self.table.insert(data))

    def insert_data(self):
        if self.index is not None:
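
A minimal sketch of the difference between the two insertion styles above, assuming SQLAlchemy 1.x (as used at the time of this PR), an in-memory SQLite engine and a made-up two-column table::

    import sqlalchemy as sa

    engine = sa.create_engine('sqlite://')
    meta = sa.MetaData()
    demo = sa.Table('demo', meta,
                    sa.Column('a', sa.Integer),
                    sa.Column('b', sa.Integer))
    meta.create_all(engine)

    rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]

    with engine.connect() as conn:
        # _execute_insert style: a plain INSERT compiled once and executed
        # with a list of parameter sets (DBAPI executemany), i.e. the driver
        # runs the same statement once per row
        conn.execute(demo.insert(), rows)

        # _execute_insert_multi style: the rows become part of the statement
        # itself, producing one INSERT ... VALUES (...), (...) for the chunk
        conn.execute(demo.insert(rows))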
@@ -611,11 +635,18 @@ def insert_data(self):
        return column_names, data_list

    def _execute_insert(self, conn, keys, data_iter):
        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
        conn.execute(self.insert_statement(), data)
    def insert(self, chunksize=None, method=None):

        # set insert method
        if method in (None, 'default'):

Review comment: is the 'None' needed here?

            exec_insert = self._execute_insert
        elif method == 'multi':
            exec_insert = self._execute_insert_multi
        elif callable(method):
            exec_insert = partial(method, self)
        else:
            raise ValueError('Invalid parameter `method`: {}'.format(method))

    def insert(self, chunksize=None):
        keys, data_list = self.insert_data()

        nrows = len(self.frame)

@@ -638,7 +669,7 @@ def insert(self, chunksize=None):
                        break

                    chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
                    self._execute_insert(conn, keys, chunk_iter)
                    exec_insert(conn, keys, chunk_iter)

    def _query_iterator(self, result, chunksize, columns, coerce_float=True,
                        parse_dates=None):
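
For reference, a minimal sketch (not from the PR) of a user-supplied ``method`` callable: the dispatch above wraps it with ``partial(method, self)`` and then calls ``exec_insert(conn, keys, chunk_iter)``, so the callable receives ``(pd_table, conn, keys, data_iter)``. The body below simply mirrors the default behaviour; the name ``log_and_insert`` is made up::

    def log_and_insert(pd_table, conn, keys, data_iter):
        # pd_table is the pandas SQLTable wrapper; pd_table.table is the
        # underlying SQLAlchemy Table object
        rows = [dict(zip(keys, row)) for row in data_iter]
        print('inserting {} rows into {}'.format(len(rows), pd_table.name))
        conn.execute(pd_table.table.insert(), rows)

    # df.to_sql('demo', engine, method=log_and_insert)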
@@ -1078,7 +1109,8 @@ def read_query(self, sql, index_col=None, coerce_float=True,
    read_sql = read_query

    def to_sql(self, frame, name, if_exists='fail', index=True,
               index_label=None, schema=None, chunksize=None, dtype=None):
               index_label=None, schema=None, chunksize=None, dtype=None,
               method='default'):
        """
        Write records stored in a DataFrame to a SQL database.
@@ -1108,7 +1140,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
            Optional specifying the datatype for columns. The SQL type should
            be a SQLAlchemy type. If all columns are of the same type, one
            single value can be used.
        method : {'default', 'multi', callable}, default 'default'
            Controls the SQL insertion clause used.
        """
        if dtype and not is_dict_like(dtype):
            dtype = {col_name: dtype for col_name in frame}
@@ -1124,7 +1157,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
                         if_exists=if_exists, index_label=index_label,
                         schema=schema, dtype=dtype)
        table.create()
        table.insert(chunksize)
        table.insert(chunksize, method=method)
        if (not name.isdigit() and not name.islower()):
            # check for potentially case sensitivity issues (GH7815)
            # Only check when name is not a number and name is not lower case
@@ -1434,7 +1467,8 @@ def _fetchall_as_list(self, cur):
        return result

    def to_sql(self, frame, name, if_exists='fail', index=True,
               index_label=None, schema=None, chunksize=None, dtype=None):
               index_label=None, schema=None, chunksize=None, dtype=None,
               method='default'):
        """
        Write records stored in a DataFrame to a SQL database.
@@ -1463,7 +1497,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
            Optional specifying the datatype for columns. The SQL type should
            be a string. If all columns are of the same type, one single value
            can be used.
        method : {'default', 'multi', callable}, default 'default'
            Controls the SQL insertion clause used.
        """
        if dtype and not is_dict_like(dtype):
            dtype = {col_name: dtype for col_name in frame}
@@ -1478,7 +1513,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
                            if_exists=if_exists, index_label=index_label,
                            dtype=dtype)
        table.create()
        table.insert(chunksize)
        table.insert(chunksize, method)

    def has_table(self, name, schema=None):
        # TODO(wesm): unused?