# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import collections
import json
try:
import fastavro
except ImportError: # pragma: NO COVER
fastavro = None
import google.api_core.exceptions
try:
import pandas
except ImportError: # pragma: NO COVER
pandas = None
try:
import pyarrow
except ImportError: # pragma: NO COVER
pyarrow = None
import six
from google.cloud.bigquery_storage_v1beta1 import types
_STREAM_RESUMPTION_EXCEPTIONS = (google.api_core.exceptions.ServiceUnavailable,)
_FASTAVRO_REQUIRED = (
"fastavro is required to parse ReadRowResponse messages with Avro bytes."
)
_PANDAS_REQUIRED = "pandas is required to create a DataFrame"
_PYARROW_REQUIRED = (
"pyarrow is required to parse ReadRowResponse messages with Arrow bytes."
)
class ReadRowsStream(object):
"""A stream of results from a read rows request.
This stream is an iterable of
:class:`~google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse`.
Iterate over it to fetch all row messages.
If the fastavro library is installed, use the
:func:`~google.cloud.bigquery_storage_v1beta1.reader.ReadRowsStream.rows()`
method to parse all messages into a stream of row dictionaries.
If the pandas and fastavro libraries are installed, use the
:func:`~google.cloud.bigquery_storage_v1beta1.reader.ReadRowsStream.to_dataframe()`
method to parse all messages into a :class:`pandas.DataFrame`.
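    Example:
        A sketch of typical usage. The project, dataset, table, and stream
        count below are placeholders, and the calls assume the handwritten
        ``BigQueryStorageClient`` wrapper whose ``read_rows`` method returns
        this class::

            from google.cloud import bigquery_storage_v1beta1

            client = bigquery_storage_v1beta1.BigQueryStorageClient()
            table_ref = bigquery_storage_v1beta1.types.TableReference(
                project_id="bigquery-public-data",
                dataset_id="usa_names",
                table_id="usa_1910_current",
            )
            session = client.create_read_session(
                table_ref, "projects/your-project-id", requested_streams=1
            )
            position = bigquery_storage_v1beta1.types.StreamPosition(
                stream=session.streams[0]
            )
            reader = client.read_rows(position)
            for row in reader.rows(session):  # requires fastavro
                print(row)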
"""
def __init__(self, wrapped, client, read_position, read_rows_kwargs):
"""Construct a ReadRowsStream.
Args:
wrapped (Iterable[ \
~google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse \
]):
The ReadRows stream to read.
client ( \
~google.cloud.bigquery_storage_v1beta1.gapic. \
big_query_storage_client.BigQueryStorageClient \
):
A GAPIC client used to reconnect to a ReadRows stream. This
must be the GAPIC client to avoid a circular dependency on
this class.
read_position (Union[ \
dict, \
~google.cloud.bigquery_storage_v1beta1.types.StreamPosition \
]):
Required. Identifier of the position in the stream to start
reading from. The offset requested must be less than the last
row read from ReadRows. Requesting a larger offset is
undefined. If a dict is provided, it must be of the same form
as the protobuf message
:class:`~google.cloud.bigquery_storage_v1beta1.types.StreamPosition`
read_rows_kwargs (dict):
Keyword arguments to use when reconnecting to a ReadRows
stream.
Returns:
Iterable[ \
~google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse \
]:
A sequence of row messages.
"""
# Make a copy of the read position so that we can update it without
# mutating the original input.
self._position = _copy_stream_position(read_position)
self._client = client
self._wrapped = wrapped
self._read_rows_kwargs = read_rows_kwargs
def __iter__(self):
"""An iterable of messages.
Returns:
Iterable[ \
~google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse \
]:
A sequence of row messages.
"""
# Infinite loop to reconnect on reconnectable errors while processing
# the row stream.
while True:
try:
for message in self._wrapped:
rowcount = message.row_count
self._position.offset += rowcount
yield message
return # Made it through the whole stream.
except _STREAM_RESUMPTION_EXCEPTIONS:
# Transient error, so reconnect to the stream.
pass
self._reconnect()
def _reconnect(self):
"""Reconnect to the ReadRows stream using the most recent offset."""
self._wrapped = self._client.read_rows(
_copy_stream_position(self._position), **self._read_rows_kwargs
)
    def rows(self, read_session):
"""Iterate over all rows in the stream.
This method requires the fastavro library in order to parse row
messages.
.. warning::
DATETIME columns are not supported. They are currently parsed as
strings in the fastavro library.
Args:
read_session ( \
~google.cloud.bigquery_storage_v1beta1.types.ReadSession \
):
The read session associated with this read rows stream. This
contains the schema, which is required to parse the data
messages.
Returns:
Iterable[Mapping]:
A sequence of rows, represented as dictionaries.
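        Example:
            A short sketch, reusing ``reader`` and ``session`` from the
            :class:`ReadRowsStream` example::

                for row in reader.rows(session):
                    print(row)  # each row is a dict keyed by column name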
"""
return ReadRowsIterable(self, read_session)
    def to_arrow(self, read_session):
"""Create a :class:`pyarrow.Table` of all rows in the stream.
This method requires the pyarrow library and a stream using the Arrow
format.
Args:
read_session ( \
~google.cloud.bigquery_storage_v1beta1.types.ReadSession \
):
The read session associated with this read rows stream. This
contains the schema, which is required to parse the data
messages.
Returns:
pyarrow.Table:
A table of all rows in the stream.
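        Example:
            A sketch that assumes the read session was created with the Arrow
            data format (for example, by passing
            ``format_=enums.DataFormat.ARROW`` to ``create_read_session``)::

                table = reader.to_arrow(session)
                print(table.num_rows, table.num_columns)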
"""
return self.rows(read_session).to_arrow()
    def to_dataframe(self, read_session, dtypes=None):
"""Create a :class:`pandas.DataFrame` of all rows in the stream.
        This method requires the pandas library to create a data frame and the
fastavro library to parse row messages.
.. warning::
DATETIME columns are not supported. They are currently parsed as
strings.
Args:
read_session ( \
~google.cloud.bigquery_storage_v1beta1.types.ReadSession \
):
The read session associated with this read rows stream. This
contains the schema, which is required to parse the data
messages.
dtypes ( \
Map[str, Union[str, pandas.Series.dtype]] \
):
                Optional. A dictionary of column names to pandas ``dtype``s. The
provided ``dtype`` is used when constructing the series for
the column specified. Otherwise, the default pandas behavior
is used.
Returns:
pandas.DataFrame:
A data frame of all rows in the stream.
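        Example:
            A sketch of overriding a column dtype; ``year`` refers to an
            integer column in the example table used for
            :class:`ReadRowsStream`, so substitute a column from your own
            table::

                frame = reader.to_dataframe(session, dtypes={"year": "int32"})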
"""
if pandas is None:
raise ImportError(_PANDAS_REQUIRED)
return self.rows(read_session).to_dataframe(dtypes=dtypes)
class ReadRowsIterable(object):
"""An iterable of rows from a read session.
Args:
reader (google.cloud.bigquery_storage_v1beta1.reader.ReadRowsStream):
A read rows stream.
read_session (google.cloud.bigquery_storage_v1beta1.types.ReadSession):
A read session. This is required because it contains the schema
used in the stream messages.
"""
    # This class is modeled after the google.cloud.bigquery.table.RowIterator
# and aims to be API compatible where possible.
def __init__(self, reader, read_session):
self._status = None
self._reader = reader
self._read_session = read_session
self._stream_parser = _StreamParser.from_read_session(self._read_session)
@property
def total_rows(self):
"""int: Number of estimated rows in the current stream.
May change over time.
"""
return getattr(self._status, "estimated_row_count", None)
@property
def pages(self):
"""A generator of all pages in the stream.
Returns:
types.GeneratorType[google.cloud.bigquery_storage_v1beta1.ReadRowsPage]:
A generator of pages.
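        Example:
            A sketch of page-wise processing, with ``reader`` and ``session``
            as in the :class:`ReadRowsStream` example::

                for page in reader.rows(session).pages:
                    print("rows in this message:", page.num_items)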
"""
        # Each page is an iterator of rows, but it also exposes num_items,
        # remaining, and to_dataframe.
for message in self._reader:
self._status = message.status
yield ReadRowsPage(self._stream_parser, message)
def __iter__(self):
"""Iterator for each row in all pages."""
for page in self.pages:
for row in page:
yield row
    def to_arrow(self):
"""Create a :class:`pyarrow.Table` of all rows in the stream.
This method requires the pyarrow library and a stream using the Arrow
format.
Returns:
pyarrow.Table:
A table of all rows in the stream.
"""
record_batches = []
for page in self.pages:
record_batches.append(page.to_arrow())
return pyarrow.Table.from_batches(record_batches)
    def to_dataframe(self, dtypes=None):
"""Create a :class:`pandas.DataFrame` of all rows in the stream.
        This method requires the pandas library to create a data frame and the
fastavro library to parse row messages.
.. warning::
DATETIME columns are not supported. They are currently parsed as
strings in the fastavro library.
Args:
dtypes ( \
Map[str, Union[str, pandas.Series.dtype]] \
):
                Optional. A dictionary of column names to pandas ``dtype``s. The
provided ``dtype`` is used when constructing the series for
the column specified. Otherwise, the default pandas behavior
is used.
Returns:
pandas.DataFrame:
A data frame of all rows in the stream.
"""
if pandas is None:
raise ImportError(_PANDAS_REQUIRED)
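        # Build one DataFrame per page (one page per stream message) and
        # concatenate them at the end, preserving stream order.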
frames = []
for page in self.pages:
frames.append(page.to_dataframe(dtypes=dtypes))
return pandas.concat(frames)
class ReadRowsPage(object):
"""An iterator of rows from a read session message.
Args:
stream_parser (google.cloud.bigquery_storage_v1beta1.reader._StreamParser):
A helper for parsing messages into rows.
message (google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse):
A message of data from a read rows stream.
"""
# This class is modeled after google.api_core.page_iterator.Page and aims
# to provide API compatibility where possible.
def __init__(self, stream_parser, message):
self._stream_parser = stream_parser
self._message = message
self._iter_rows = None
self._num_items = self._message.row_count
self._remaining = self._message.row_count
def _parse_rows(self):
"""Parse rows from the message only once."""
if self._iter_rows is not None:
return
rows = self._stream_parser.to_rows(self._message)
self._iter_rows = iter(rows)
@property
def num_items(self):
"""int: Total items in the page."""
return self._num_items
@property
def remaining(self):
"""int: Remaining items in the page."""
return self._remaining
def __iter__(self):
"""A ``ReadRowsPage`` is an iterator."""
return self
    def next(self):
"""Get the next row in the page."""
self._parse_rows()
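        # ``remaining`` is decremented only for bookkeeping; exhaustion is
        # signaled by the underlying row iterator raising StopIteration.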
if self._remaining > 0:
self._remaining -= 1
return six.next(self._iter_rows)
# Alias needed for Python 2/3 support.
__next__ = next
    def to_arrow(self):
"""Create an :class:`pyarrow.RecordBatch` of rows in the page.
Returns:
pyarrow.RecordBatch:
Rows from the message, as an Arrow record batch.
"""
return self._stream_parser.to_arrow(self._message)
    def to_dataframe(self, dtypes=None):
"""Create a :class:`pandas.DataFrame` of rows in the page.
        This method requires the pandas library to create a data frame and the
fastavro library to parse row messages.
.. warning::
DATETIME columns are not supported. They are currently parsed as
strings in the fastavro library.
Args:
dtypes ( \
Map[str, Union[str, pandas.Series.dtype]] \
):
                Optional. A dictionary of column names to pandas ``dtype``s. The
provided ``dtype`` is used when constructing the series for
the column specified. Otherwise, the default pandas behavior
is used.
Returns:
pandas.DataFrame:
A data frame of all rows in the stream.
"""
if pandas is None:
raise ImportError(_PANDAS_REQUIRED)
return self._stream_parser.to_dataframe(self._message, dtypes=dtypes)
class _StreamParser(object):
def to_arrow(self, message):
raise NotImplementedError("Not implemented.")
def to_dataframe(self, message, dtypes=None):
raise NotImplementedError("Not implemented.")
def to_rows(self, message):
raise NotImplementedError("Not implemented.")
@staticmethod
def from_read_session(read_session):
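        # The session schema is a protobuf oneof: exactly one of avro_schema
        # or arrow_schema is populated, and that choice determines which
        # parser can decode the serialized rows in each message.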
schema_type = read_session.WhichOneof("schema")
if schema_type == "avro_schema":
return _AvroStreamParser(read_session)
elif schema_type == "arrow_schema":
return _ArrowStreamParser(read_session)
else:
raise TypeError(
"Unsupported schema type in read_session: {0}".format(schema_type)
)
class _AvroStreamParser(_StreamParser):
"""Helper to parse Avro messages into useful representations."""
def __init__(self, read_session):
"""Construct an _AvroStreamParser.
Args:
read_session (google.cloud.bigquery_storage_v1beta1.types.ReadSession):
A read session. This is required because it contains the schema
used in the stream messages.
"""
if fastavro is None:
raise ImportError(_FASTAVRO_REQUIRED)
self._read_session = read_session
self._avro_schema_json = None
self._fastavro_schema = None
self._column_names = None
def to_arrow(self, message):
"""Create an :class:`pyarrow.RecordBatch` of rows in the page.
Args:
message (google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse):
Protocol buffer from the read rows stream, to convert into an
Arrow record batch.
Returns:
pyarrow.RecordBatch:
Rows from the message, as an Arrow record batch.
"""
raise NotImplementedError("to_arrow not implemented for Avro streams.")
def to_dataframe(self, message, dtypes=None):
"""Create a :class:`pandas.DataFrame` of rows in the page.
        This method requires the pandas library to create a data frame and the
fastavro library to parse row messages.
.. warning::
DATETIME columns are not supported. They are currently parsed as
strings in the fastavro library.
Args:
dtypes ( \
Map[str, Union[str, pandas.Series.dtype]] \
):
                Optional. A dictionary of column names to pandas ``dtype``s. The
provided ``dtype`` is used when constructing the series for
the column specified. Otherwise, the default pandas behavior
is used.
Returns:
pandas.DataFrame:
A data frame of all rows in the stream.
"""
self._parse_avro_schema()
if dtypes is None:
dtypes = {}
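        # Accumulate values column by column so that each column can be
        # converted to a pandas Series (optionally with a caller-supplied
        # dtype) before assembling the DataFrame.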
columns = collections.defaultdict(list)
for row in self.to_rows(message):
for column in row:
columns[column].append(row[column])
for column in dtypes:
columns[column] = pandas.Series(columns[column], dtype=dtypes[column])
return pandas.DataFrame(columns, columns=self._column_names)
def _parse_avro_schema(self):
"""Extract and parse Avro schema from a read session."""
if self._avro_schema_json:
return
self._avro_schema_json = json.loads(self._read_session.avro_schema.schema)
        self._column_names = tuple(
            field["name"] for field in self._avro_schema_json["fields"]
        )
def _parse_fastavro(self):
"""Convert parsed Avro schema to fastavro format."""
self._parse_avro_schema()
self._fastavro_schema = fastavro.parse_schema(self._avro_schema_json)
def to_rows(self, message):
"""Parse all rows in a stream message.
Args:
message ( \
~google.cloud.bigquery_storage_v1beta1.types.ReadRowsResponse \
):
A message containing Avro bytes to parse into rows.
Returns:
Iterable[Mapping]:
A sequence of rows, represented as dictionaries.
"""
self._parse_fastavro()
messageio = six.BytesIO(message.avro_rows.serialized_binary_rows)
while True:
            # Use a while loop because schemaless_reader can only read
            # a single record per call.
try:
# TODO: Parse DATETIME into datetime.datetime (no timezone),
# instead of as a string.
yield fastavro.schemaless_reader(messageio, self._fastavro_schema)
except StopIteration:
break # Finished with message
class _ArrowStreamParser(_StreamParser):
def __init__(self, read_session):
if pyarrow is None:
raise ImportError(_PYARROW_REQUIRED)
self._read_session = read_session
self._schema = None
def to_arrow(self, message):
return self._parse_arrow_message(message)
def to_rows(self, message):
record_batch = self._parse_arrow_message(message)
        # Iterate through the columns in lockstep and build a dict mapping
        # column name to value for each row.
for row in zip(*record_batch.columns):
yield dict(zip(self._column_names, row))
def to_dataframe(self, message, dtypes=None):
record_batch = self._parse_arrow_message(message)
if dtypes is None:
dtypes = {}
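        # Convert the whole record batch to a DataFrame in one step, then
        # re-cast any columns for which the caller requested a specific dtype.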
df = record_batch.to_pandas()
for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
return df
def _parse_arrow_message(self, message):
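        # Each ReadRowsResponse carries a serialized Arrow RecordBatch, which
        # is deserialized here against the schema sent once per read session.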
self._parse_arrow_schema()
return pyarrow.read_record_batch(
pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch),
self._schema,
)
def _parse_arrow_schema(self):
if self._schema:
return
self._schema = pyarrow.read_schema(
pyarrow.py_buffer(self._read_session.arrow_schema.serialized_schema)
)
self._column_names = [field.name for field in self._schema]
def _copy_stream_position(position):
"""Copy a StreamPosition.
Args:
position (Union[ \
dict, \
~google.cloud.bigquery_storage_v1beta1.types.StreamPosition \
]):
            StreamPosition (or dictionary in StreamPosition format) to copy.
Returns:
~google.cloud.bigquery_storage_v1beta1.types.StreamPosition:
            A copy of the input StreamPosition.
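    Example:
        Both forms below are accepted; the dictionary keys follow the
        ``StreamPosition`` protobuf field names::

            _copy_stream_position({"offset": 10})
            _copy_stream_position(types.StreamPosition(offset=10))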
"""
if isinstance(position, types.StreamPosition):
output = types.StreamPosition()
output.CopyFrom(position)
return output
return types.StreamPosition(**position)