@@ -304,28 +304,6 @@ def _print(self, msg, end='\n'):
         sys.stdout.write(msg + end)
         sys.stdout.flush()
 
-    def _start_timer(self):
-        self.start = time.time()
-
-    def get_elapsed_seconds(self):
-        return round(time.time() - self.start, 2)
-
-    def print_elapsed_seconds(self, prefix='Elapsed', postfix='s.',
-                              overlong=7):
-        sec = self.get_elapsed_seconds()
-        if sec > overlong:
-            self._print('{} {} {}'.format(prefix, sec, postfix))
-
-    # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
-    @staticmethod
-    def sizeof_fmt(num, suffix='B'):
-        fmt = "%3.1f %s%s"
-        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
-            if abs(num) < 1024.0:
-                return fmt % (num, unit, suffix)
-            num /= 1024.0
-        return fmt % (num, 'Y', suffix)
-
     def get_service(self):
         import httplib2
         try:
@@ -379,132 +357,6 @@ def process_insert_errors(self, insert_errors):
 
         raise StreamingInsertError
 
-    def run_query(self, query, **kwargs):
-        try:
-            from googleapiclient.errors import HttpError
-        except:
-            from apiclient.errors import HttpError
-        from oauth2client.client import AccessTokenRefreshError
-
-        _check_google_client_version()
-
-        job_collection = self.service.jobs()
-
-        job_config = {
-            'query': {
-                'query': query,
-                'useLegacySql': self.dialect == 'legacy'
-                # 'allowLargeResults', 'createDisposition',
-                # 'preserveNulls', destinationTable, useQueryCache
-            }
-        }
-        config = kwargs.get('configuration')
-        if config is not None:
-            if len(config) != 1:
-                raise ValueError("Only one job type must be specified, but "
-                                 "given {}".format(','.join(config.keys())))
-            if 'query' in config:
-                if 'query' in config['query'] and query is not None:
-                    raise ValueError("Query statement can't be specified "
-                                     "inside config while it is specified "
-                                     "as parameter")
-
-                job_config['query'].update(config['query'])
-            else:
-                raise ValueError("Only 'query' job type is supported")
-
-        job_data = {
-            'configuration': job_config
-        }
-
-        self._start_timer()
-        try:
-            self._print('Requesting query... ', end="")
-            query_reply = job_collection.insert(
-                projectId=self.project_id, body=job_data).execute()
-            self._print('ok.\nQuery running...')
-        except (AccessTokenRefreshError, ValueError):
-            if self.private_key:
-                raise AccessDenied(
-                    "The service account credentials are not valid")
-            else:
-                raise AccessDenied(
-                    "The credentials have been revoked or expired, "
-                    "please re-run the application to re-authorize")
-        except HttpError as ex:
-            self.process_http_error(ex)
-
-        job_reference = query_reply['jobReference']
-
-        while not query_reply.get('jobComplete', False):
-            self.print_elapsed_seconds(' Elapsed', 's. Waiting...')
-            try:
-                query_reply = job_collection.getQueryResults(
-                    projectId=job_reference['projectId'],
-                    jobId=job_reference['jobId']).execute()
-            except HttpError as ex:
-                self.process_http_error(ex)
-
-        if self.verbose:
-            if query_reply['cacheHit']:
-                self._print('Query done.\nCache hit.\n')
-            else:
-                bytes_processed = int(query_reply.get(
-                    'totalBytesProcessed', '0'))
-                self._print('Query done.\nProcessed: {}\n'.format(
-                    self.sizeof_fmt(bytes_processed)))
-
-            self._print('Retrieving results...')
-
-        total_rows = int(query_reply['totalRows'])
-        result_pages = list()
-        seen_page_tokens = list()
-        current_row = 0
-        # Only read schema on first page
-        schema = query_reply['schema']
-
-        # Loop through each page of data
-        while 'rows' in query_reply and current_row < total_rows:
-            page = query_reply['rows']
-            result_pages.append(page)
-            current_row += len(page)
-
-            self.print_elapsed_seconds(
-                ' Got page: {}; {}% done. Elapsed'.format(
-                    len(result_pages),
-                    round(100.0 * current_row / total_rows)))
-
-            if current_row == total_rows:
-                break
-
-            page_token = query_reply.get('pageToken', None)
-
-            if not page_token and current_row < total_rows:
-                raise InvalidPageToken("Required pageToken was missing. "
-                                       "Received {0} of {1} rows"
-                                       .format(current_row, total_rows))
-
-            elif page_token in seen_page_tokens:
-                raise InvalidPageToken("A duplicate pageToken was returned")
-
-            seen_page_tokens.append(page_token)
-
-            try:
-                query_reply = job_collection.getQueryResults(
-                    projectId=job_reference['projectId'],
-                    jobId=job_reference['jobId'],
-                    pageToken=page_token).execute()
-            except HttpError as ex:
-                self.process_http_error(ex)
-
-        if current_row < total_rows:
-            raise InvalidPageToken()
-
-        # print basic query stats
-        self._print('Got {} rows.\n'.format(total_rows))
-
-        return schema, result_pages
-
     def load_data(self, dataframe, dataset_id, table_id, chunksize):
         try:
             from googleapiclient.errors import HttpError
@@ -606,171 +458,7 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
         sleep(delay)
 
-
-def _parse_data(schema, rows):
-    # see:
-    # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
-    # #missing-data-casting-rules-and-indexing
-    dtype_map = {'FLOAT': np.dtype(float),
-                 'TIMESTAMP': 'M8[ns]'}
-
-    fields = schema['fields']
-    col_types = [field['type'] for field in fields]
-    col_names = [str(field['name']) for field in fields]
-    col_dtypes = [dtype_map.get(field['type'], object) for field in fields]
-    page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes))
-    for row_num, raw_row in enumerate(rows):
-        entries = raw_row.get('f', [])
-        for col_num, field_type in enumerate(col_types):
-            field_value = _parse_entry(entries[col_num].get('v', ''),
-                                       field_type)
-            page_array[row_num][col_num] = field_value
-
-    return DataFrame(page_array, columns=col_names)
-
-
-def _parse_entry(field_value, field_type):
-    if field_value is None or field_value == 'null':
-        return None
-    if field_type == 'INTEGER':
-        return int(field_value)
-    elif field_type == 'FLOAT':
-        return float(field_value)
-    elif field_type == 'TIMESTAMP':
-        timestamp = datetime.utcfromtimestamp(float(field_value))
-        return np.datetime64(timestamp)
-    elif field_type == 'BOOLEAN':
-        return field_value == 'true'
-    return field_value
-
-
-def read_gbq(query, project_id=None, index_col=None, col_order=None,
-             reauth=False, verbose=True, private_key=None, dialect='legacy',
-             **kwargs):
-    r"""Load data from Google BigQuery.
-
-    The main method a user calls to execute a Query in Google BigQuery
-    and read results into a pandas DataFrame.
-
-    Google BigQuery API Client Library v2 for Python is used.
-    Documentation is available `here
-    <https://developers.google.com/api-client-library/python/apis/bigquery/v2>`__
-
-    Authentication to the Google BigQuery service is via OAuth 2.0.
-
-    - If "private_key" is not provided:
-
-      By default "application default credentials" are used.
-
-      If default application credentials are not found or are restrictive,
-      user account credentials are used. In this case, you will be asked to
-      grant permissions for product name 'pandas GBQ'.
-
-    - If "private_key" is provided:
-
-      Service account credentials will be used to authenticate.
-
-    Parameters
-    ----------
-    query : str
-        SQL-Like Query to return data values
-    project_id : str
-        Google BigQuery Account project ID.
-    index_col : str (optional)
-        Name of result column to use for index in results DataFrame
-    col_order : list(str) (optional)
-        List of BigQuery column names in the desired order for results
-        DataFrame
-    reauth : boolean (default False)
-        Force Google BigQuery to reauthenticate the user. This is useful
-        if multiple accounts are used.
-    verbose : boolean (default True)
-        Verbose output
-    private_key : str (optional)
-        Service account private key in JSON format. Can be file path
-        or string contents. This is useful for remote server
-        authentication (eg. jupyter iPython notebook on remote host)
-
-    dialect : {'legacy', 'standard'}, default 'legacy'
-        'legacy' : Use BigQuery's legacy SQL dialect.
-        'standard' : Use BigQuery's standard SQL (beta), which is
-        compliant with the SQL 2011 standard. For more information
-        see `BigQuery SQL Reference
-        <https://cloud.google.com/bigquery/sql-reference/>`__
-
-    **kwargs : Arbitrary keyword arguments
-        configuration (dict): query config parameters for job processing.
-        For example:
-
-            configuration = {'query': {'useQueryCache': False}}
-
-        For more information see `BigQuery SQL Reference
-        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__
-
-    Returns
-    -------
-    df: DataFrame
-        DataFrame representing results of query
-
-    """
-
-    if not project_id:
-        raise TypeError("Missing required parameter: project_id")
-
-    if dialect not in ('legacy', 'standard'):
-        raise ValueError("'{0}' is not valid for dialect".format(dialect))
-
-    connector = GbqConnector(project_id, reauth=reauth, verbose=verbose,
-                             private_key=private_key,
-                             dialect=dialect)
-    schema, pages = connector.run_query(query, **kwargs)
-    dataframe_list = []
-    while len(pages) > 0:
-        page = pages.pop()
-        dataframe_list.append(_parse_data(schema, page))
-
-    if len(dataframe_list) > 0:
-        final_df = concat(dataframe_list, ignore_index=True)
-    else:
-        final_df = _parse_data(schema, [])
-
-    # Reindex the DataFrame on the provided column
-    if index_col is not None:
-        if index_col in final_df.columns:
-            final_df.set_index(index_col, inplace=True)
-        else:
-            raise InvalidIndexColumn(
-                'Index column "{0}" does not exist in DataFrame.'
-                .format(index_col)
-            )
-
-    # Change the order of columns in the DataFrame based on provided list
-    if col_order is not None:
-        if sorted(col_order) == sorted(final_df.columns):
-            final_df = final_df[col_order]
-        else:
-            raise InvalidColumnOrder(
-                'Column order does not match this DataFrame.'
-            )
-
-    # cast BOOLEAN and INTEGER columns from object to bool/int
-    # if they dont have any nulls
-    type_map = {'BOOLEAN': bool, 'INTEGER': int}
-    for field in schema['fields']:
-        if field['type'] in type_map and \
-                final_df[field['name']].notnull().all():
-            final_df[field['name']] = \
-                final_df[field['name']].astype(type_map[field['type']])
-
-    connector.print_elapsed_seconds(
-        'Total time taken',
-        datetime.now().strftime('s.\nFinished at %Y-%m-%d %H:%M:%S.'),
-        0
-    )
-
-    return final_df
-
-def from_gbq(query, project_id=None, index_col=None, col_order=None,
+def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=True,
              private_key=None, dialect='legacy', configuration=None, **kwargs):
     r"""Load data from Google BigQuery using google-cloud-python
 
@@ -798,6 +486,8 @@ def from_gbq(query, project_id=None, index_col=None, col_order=None,
     col_order : list(str) (optional)
         List of BigQuery column names in the desired order for results
         DataFrame
+    verbose : boolean (default True)
+        Verbose output
     private_key : str (optional)
         Path to service account private key in JSON format. If none is provided,
         will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable
@@ -826,6 +516,15 @@ def from_gbq(query, project_id=None, index_col=None, col_order=None,
 
     """
 
+    # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
+    def sizeof_fmt(num, suffix='B'):
+        fmt = "%3.1f %s%s"
+        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+            if abs(num) < 1024.0:
+                return fmt % (num, unit, suffix)
+            num /= 1024.0
+        return fmt % (num, 'Y', suffix)
+
     if private_key:
         os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = private_key
 
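A quick, illustrative check of the `sizeof_fmt` helper added in the hunk above (the values are arbitrary, and since the helper is nested inside `read_gbq` these calls only show the expected formatting, not a standalone API):

    sizeof_fmt(532)              # '532.0 B'
    sizeof_fmt(20 * 1024 ** 2)   # '20.0 MB'
    sizeof_fmt(3.2 * 1024 ** 4)  # '3.2 TB'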
@@ -849,11 +548,31 @@ def _wait_for_job(job):
             setattr(query_job, setting, value)
 
     query_job.begin()
-    _wait_for_job(query_job)
 
+    if verbose:
+        print("Query running...")
+    _wait_for_job(query_job)
+    if verbose:
+        print("Query done.")
+        if query_job._properties["statistics"]["query"].get("cacheHit", False):
+            print("Cache hit.")
+        elif "statistics" in query_job._properties and "query" in query_job._properties["statistics"]:
+            bytes_billed = int(query_job._properties["statistics"]["query"].get("totalBytesBilled", 0))
+            bytes_processed = int(query_job._properties["statistics"]["query"].get("totalBytesProcessed", 0))
+            print("Total bytes billed (processed): %s (%s)" % (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed)))
 
     query_results = query_job.results()
 
+    if verbose:
+        print("\nRetrieving results...")
+
     rows, total_rows, page_token = query_results.fetch_data()
+
+    if verbose:
+        print("Got %s rows." % total_rows)
+        print("\nTotal time taken %s s" % (datetime.utcnow() - query_job.created.replace(tzinfo=None)).seconds)
+        print("Finished at %s." % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
+
+
     columns = [field.name for field in query_results.schema]
     data = rows
 
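For orientation, a minimal, hypothetical call of the `read_gbq` signature this commit introduces. The module path, project ID, and key-file path are placeholders rather than values taken from the diff, and the query is arbitrary; the sketch only illustrates the new `verbose` flag and the google-cloud-python-backed signature:

    # Hypothetical usage sketch; nothing below is prescribed by the commit itself.
    from gbq import read_gbq  # assumes the patched module is importable as `gbq`

    df = read_gbq(
        "SELECT 'hello' AS greeting, 42 AS answer",
        project_id="my-gcp-project",                  # required BigQuery project
        verbose=True,                                 # new flag: print job progress and byte counts
        private_key="/path/to/service_account.json",  # exported to GOOGLE_APPLICATION_CREDENTIALS
        dialect="standard",
    )
    print(df)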