github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/io.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Sources and sinks for the Beam DataFrame API.
    18  
    19  Sources
    20  #######
    21  This module provides analogs for pandas ``read`` methods, like
     22  :func:`pandas.read_csv`. However, Beam sources like :func:`read_csv`
    23  create a Beam :class:`~apache_beam.PTransform`, and return a
    24  :class:`~apache_beam.dataframe.frames.DeferredDataFrame` or
    25  :class:`~apache_beam.dataframe.frames.DeferredSeries` representing the contents
    26  of the referenced file(s) or data source.
    27  
    28  The result of these methods must be applied to a :class:`~apache_beam.Pipeline`
    29  object, for example::
    30  
    31      df = p | beam.dataframe.io.read_csv(...)
    32  
    33  Sinks
    34  #####
    35  This module also defines analogs for pandas sink, or ``to``, methods that
    36  generate a Beam :class:`~apache_beam.PTransform`. Users should prefer calling
    37  these operations from :class:`~apache_beam.dataframe.frames.DeferredDataFrame`
    38  instances (for example with
    39  :meth:`DeferredDataFrame.to_csv
    40  <apache_beam.dataframe.frames.DeferredDataFrame.to_csv>`).
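        
        A minimal, illustrative sketch of combining a source and a sink (the
        paths here are placeholders)::
        
            with beam.Pipeline() as p:
                df = p | beam.dataframe.io.read_csv('gs://bucket/input*.csv')
                df.to_csv('gs://bucket/output')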
    41  """
    42  
    43  import itertools
    44  import math
    45  import re
    46  from io import BytesIO
    47  from io import StringIO
    48  from io import TextIOWrapper
    49  
    50  import pandas as pd
    51  
    52  import apache_beam as beam
    53  from apache_beam import io
    54  from apache_beam.dataframe import frame_base
    55  from apache_beam.io import fileio
    56  
    57  _DEFAULT_LINES_CHUNKSIZE = 10_000
    58  _DEFAULT_BYTES_CHUNKSIZE = 1 << 20
    59  
    60  
    61  def read_gbq(
    62      table, dataset=None, project_id=None, use_bqstorage_api=False, **kwargs):
    63    """This function reads data from a BigQuery table and produces a
    64    :class:`~apache_beam.dataframe.frames.DeferredDataFrame`.
    65  
    66    Args:
    67      table (str): The table to read from. This may be given in the format
    68        'PROJECT:dataset.table', in which case the dataset and project_id
    69        arguments below may be omitted.
    70      dataset (str): The dataset containing the table
    71        (may be omitted if table was given as 'PROJECT:dataset.table').
    72      project_id (str): The ID of the project containing the table
    73        (may be omitted if table was given as 'PROJECT:dataset.table').
    74      use_bqstorage_api (bool): If True, read using the BigQuery Storage
    75        API in ReadFromBigQuery (the 'DIRECT_READ' method). If False or
    76        unspecified, read using a BigQuery export job (the 'EXPORT'
    77        method).
    78    """
    79    if table is None:
    80      raise ValueError("Please specify a BigQuery table to read from.")
    81    elif len(kwargs) > 0:
    82      raise ValueError(
    83          f"Encountered unsupported parameter(s) in read_gbq: {kwargs.keys()!r}"
    84          "")
    85    return _ReadGbq(table, dataset, project_id, use_bqstorage_api)
    86  
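        # An illustrative, untested sketch of read_gbq usage; the table name is a
        # placeholder and running it requires the GCP extras:
        #
        #   df = p | read_gbq(table='my-project:my_dataset.my_table')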
    87  
    88  @frame_base.with_docs_from(pd)
    89  def read_csv(path, *args, splittable=False, **kwargs):
    90    """If your files are large and records do not contain quoted newlines, you
    91    may pass the extra argument ``splittable=True`` to enable dynamic splitting
    92    of this read on newline boundaries. Using this option for records that do
    93    contain quoted newlines may result in partial records and data corruption."""
    94    if 'nrows' in kwargs:
    95      raise ValueError('nrows not yet supported')
    96    return _ReadFromPandas(
    97        pd.read_csv,
    98        path,
    99        args,
   100        kwargs,
   101        incremental=True,
   102        splitter=_TextFileSplitter(args, kwargs) if splittable else None)
   103  
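        # An illustrative, untested sketch of a splittable CSV read; the path is a
        # placeholder and the records are assumed to contain no quoted newlines:
        #
        #   df = p | read_csv('gs://bucket/large-*.csv', splittable=True)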
   104  
   105  def _as_pc(df, label=None):
   106    from apache_beam.dataframe import convert  # avoid circular import
   107    # TODO(roberwb): Amortize the computation for multiple writes?
   108    return convert.to_pcollection(df, yield_elements='pandas', label=label)
   109  
   110  
   111  @frame_base.with_docs_from(pd.DataFrame)
   112  def to_csv(df, path, transform_label=None, *args, **kwargs):
   113    label_pc = f"{transform_label} - ToPCollection" if transform_label \
   114      else f"ToPCollection(df) - {path}"
   115    label_pd = f"{transform_label} - ToPandasDataFrame" if transform_label \
   116      else f"WriteToPandas(df) - {path}"
   117    return _as_pc(df, label_pc) | label_pd >> _WriteToPandas(
   118        'to_csv', path, args, kwargs, incremental=True, binary=False)
   119  
   120  
   121  @frame_base.with_docs_from(pd)
   122  def read_fwf(path, *args, **kwargs):
   123    return _ReadFromPandas(
   124        pd.read_fwf,
   125        path,
   126        args,
   127        kwargs,
   128        incremental=True,
   129        binary=False,
   130        splitter=_TextFileSplitter(args, kwargs))
   131  
   132  
   133  @frame_base.with_docs_from(pd)
   134  def read_json(path, *args, **kwargs):
   135    if 'nrows' in kwargs:
   136      raise NotImplementedError('nrows not yet supported')
   137    elif kwargs.get('lines', False):
   138      # Work around https://github.com/pandas-dev/pandas/issues/34548.
   139      kwargs = dict(kwargs, nrows=1 << 63)
   140    return _ReadFromPandas(
   141        pd.read_json,
   142        path,
   143        args,
   144        kwargs,
   145        incremental=kwargs.get('lines', False),
   146        splitter=_DelimSplitter(b'\n', _DEFAULT_BYTES_CHUNKSIZE) if kwargs.get(
   147            'lines', False) else None,
   148        binary=False)
   149  
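        # An illustrative, untested sketch of reading newline-delimited JSON, the
        # only form this transform reads incrementally; the path is a placeholder:
        #
        #   df = p | read_json('gs://bucket/logs-*.jsonl', lines=True)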
   150  
   151  @frame_base.with_docs_from(pd.DataFrame)
   152  def to_json(df, path, orient=None, *args, **kwargs):
   153    if orient is None:
   154      if isinstance(df._expr.proxy(), pd.DataFrame):
   155        orient = 'columns'
   156      elif isinstance(df._expr.proxy(), pd.Series):
   157        orient = 'index'
   158      else:
   159        raise frame_base.WontImplementError('not dataframes or series')
   160    kwargs['orient'] = orient
   161    return _as_pc(df) | _WriteToPandas(
   162        'to_json',
   163        path,
   164        args,
   165        kwargs,
   166        incremental=orient in ('index', 'records', 'values'),
   167        binary=False)
   168  
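        # An illustrative, untested sketch of an incremental, newline-delimited JSON
        # write from a deferred frame; the path is a placeholder:
        #
        #   df.to_json('gs://bucket/output.json', orient='records', lines=True)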
   169  
   170  @frame_base.with_docs_from(pd)
   171  def read_html(path, *args, **kwargs):
   172    return _ReadFromPandas(
   173        lambda *args,
   174        **kwargs: pd.read_html(*args, **kwargs)[0],
   175        path,
   176        args,
   177        kwargs)
   178  
   179  
   180  @frame_base.with_docs_from(pd.DataFrame)
   181  def to_html(df, path, *args, **kwargs):
   182    return _as_pc(df) | _WriteToPandas(
   183        'to_html',
   184        path,
   185        args,
   186        kwargs,
   187        incremental=(
   188            df._expr.proxy().index.nlevels == 1 or
   189            not kwargs.get('sparsify', True)),
   190        binary=False)
   191  
   192  
   193  def _binary_reader(format):
   194    func = getattr(pd, 'read_%s' % format)
   195    result = lambda path, *args, **kwargs: _ReadFromPandas(func, path, args,
   196                                                           kwargs)
   197    result.__name__ = f'read_{format}'
   198  
   199    return result
   200  
   201  
   202  def _binary_writer(format):
   203    result = (
   204        lambda df,
   205        path,
   206        *args,
   207        **kwargs: _as_pc(df) | _WriteToPandas(f'to_{format}', path, args, kwargs))
   208    result.__name__ = f'to_{format}'
   209    return result
   210  
   211  
   212  for format in ('excel', 'feather', 'parquet', 'stata'):
   213    globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
   214        _binary_reader(format))
   215    globals()['to_%s' % format] = frame_base.with_docs_from(pd.DataFrame)(
   216        _binary_writer(format))
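        
        # An illustrative, untested sketch using two of the analogs generated by the
        # loop above; the paths are placeholders:
        #
        #   df = p | read_parquet('gs://bucket/input*.parquet')
        #   to_parquet(df, 'gs://bucket/output')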
   217  
   218  for format in ('sas', 'spss'):
   219    if hasattr(pd, 'read_%s' % format):  # Depends on pandas version.
   220      globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
   221          _binary_reader(format))
   222  
   223  read_clipboard = frame_base.not_implemented_method(
   224      'read_clipboard', base_type=pd)
   225  to_clipboard = frame_base.not_implemented_method(
   226      'to_clipboard', base_type=pd.DataFrame)
   227  read_msgpack = frame_base.wont_implement_method(
   228      pd, 'read_msgpack', reason="deprecated")
   229  to_msgpack = frame_base.wont_implement_method(
   230      pd.DataFrame, 'to_msgpack', reason="deprecated")
   231  read_hdf = frame_base.wont_implement_method(
   232      pd, 'read_hdf', explanation="because HDF5 is a random access file format")
   233  to_hdf = frame_base.wont_implement_method(
   234      pd.DataFrame,
   235      'to_hdf',
   236      explanation="because HDF5 is a random access file format")
   237  
   238  for name in dir(pd):
   239    if name.startswith('read_') and name not in globals():
   240      globals()[name] = frame_base.not_implemented_method(name, base_type=pd)
   241  
   242  
   243  def _shift_range_index(offset, df):
   244    if isinstance(df.index, pd.RangeIndex):
   245      return df.set_index(df.index + offset)
   246    else:
   247      return df
   248  
   249  
   250  class _ReadFromPandas(beam.PTransform):
   251    def __init__(
   252        self,
   253        reader,
   254        path,
   255        args,
   256        kwargs,
   257        binary=True,
   258        incremental=False,
   259        splitter=False):
   260      if 'compression' in kwargs:
   261        raise NotImplementedError('compression')
   262      if not isinstance(path, str):
   263        raise frame_base.WontImplementError('non-deferred')
   264      self.reader = reader
   265      self.path = path
   266      self.args = args
   267      self.kwargs = kwargs
   268      self.binary = binary
   269      self.incremental = incremental
   270      self.splitter = splitter
   271  
   272    def expand(self, root):
   273      paths_pcoll = root | beam.Create([self.path])
   274      match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
   275      if not match.metadata_list:
   276        # TODO(https://github.com/apache/beam/issues/20858): This should be
   277        # allowed for streaming pipelines if user provides an explicit schema.
   278        raise FileNotFoundError(f"Found no files that match {self.path!r}")
   279      first_path = match.metadata_list[0].path
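            # Read a small sample at pipeline-construction time; its zero-row slice
            # serves as the proxy (schema) for the deferred dataframe returned below.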
   280      with io.filesystems.FileSystems.open(first_path) as handle:
   281        if not self.binary:
   282          handle = TextIOWrapper(handle)
   283        if self.incremental:
   284          sample = next(
   285              self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
   286        else:
   287          sample = self.reader(handle, *self.args, **self.kwargs)
   288  
   289      matches_pcoll = paths_pcoll | fileio.MatchAll()
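            # Map each matched path to a dense index (in sorted order); the DoFn below
            # uses this to give every file its own disjoint range of record indices.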
   290      indices_pcoll = (
   291          matches_pcoll.pipeline
   292          | 'DoOnce' >> beam.Create([None])
   293          | beam.Map(
   294              lambda _,
   295              paths: {path: ix
   296                      for ix, path in enumerate(sorted(paths))},
   297              paths=beam.pvalue.AsList(
   298                  matches_pcoll | beam.Map(lambda match: match.path))))
   299  
   300      pcoll = (
   301          matches_pcoll
   302          | beam.Reshuffle()
   303          | fileio.ReadMatches()
   304          | beam.ParDo(
   305              _ReadFromPandasDoFn(
   306                  self.reader,
   307                  self.args,
   308                  self.kwargs,
   309                  self.binary,
   310                  self.incremental,
   311                  self.splitter),
   312              path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
   313      from apache_beam.dataframe import convert
   314      return convert.to_dataframe(pcoll, proxy=sample[:0])
   315  
   316  
   317  class _Splitter:
   318    def empty_buffer(self):
   319      """Returns an empty buffer of the right type (string or bytes).
   320      """
   321      raise NotImplementedError(self)
   322  
   323    def read_header(self, handle):
   324      """Reads the header from handle, which points to the start of the file.
   325  
   326      Returns the pair (header, buffer) where buffer contains any part of the
   327      file that was "overread" from handle while seeking the end of header.
   328      """
   329      raise NotImplementedError(self)
   330  
   331    def read_to_record_boundary(self, buffered, handle):
   332      """Reads the given handle up to the end of the current record.
   333  
   334      The buffered argument represents bytes that were read previously; logically
   335      it's as if these were pushed back into handle for reading. If the
   336      record end is within buffered, it's possible that no more bytes will be read
   337      from handle at all.
   338  
   339      Returns the pair (remaining_record_bytes, buffer) where buffer contains
   340      any part of the file that was "overread" from handle while seeking the end
   341      of the record.
   342      """
   343      raise NotImplementedError(self)
   344  
   345  
   346  class _DelimSplitter(_Splitter):
   347    """A _Splitter that splits on delimiters between records.
   348  
   349    This delimiter is assumed to never occur within a record.
   350    """
   351    def __init__(self, delim, read_chunk_size=_DEFAULT_BYTES_CHUNKSIZE):
   352      # Multi-char delimiters would require more care across chunk boundaries.
   353      assert len(delim) == 1
   354      self._delim = delim
   355      self._empty = delim[:0]
   356      self._read_chunk_size = read_chunk_size
   357  
   358    def empty_buffer(self):
   359      return self._empty
   360  
   361    def read_header(self, handle):
   362      return self._empty, self._empty
   363  
   364    def read_to_record_boundary(self, buffered, handle):
   365      if self._delim in buffered:
   366        ix = buffered.index(self._delim) + len(self._delim)
   367        return buffered[:ix], buffered[ix:]
   368      else:
   369        while True:
   370          chunk = handle.read(self._read_chunk_size)
   371          if self._delim in chunk:
   372            ix = chunk.index(self._delim) + len(self._delim)
   373            return buffered + chunk[:ix], chunk[ix:]
   374          elif not chunk:
   375            return buffered, self._empty
   376          else:
   377            buffered += chunk
   378  
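        # An illustrative, untested sketch of how _DelimSplitter consumes a buffer
        # plus a file handle (BytesIO is imported at the top of this module):
        #
        #   splitter = _DelimSplitter(b'\n', read_chunk_size=4)
        #   record, overread = splitter.read_to_record_boundary(
        #       b'ab', BytesIO(b'cd\nefgh\n'))
        #   # record == b'abcd\n'; overread == b'e', the bytes read past the
        #   # delimiter in the last chunk.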
   379  
   380  def _maybe_encode(str_or_bytes):
   381    if isinstance(str_or_bytes, str):
   382      return str_or_bytes.encode('utf-8')
   383    else:
   384      return str_or_bytes
   385  
   386  
   387  class _TextFileSplitter(_DelimSplitter):
   388    """Splitter for dynamically sharding CSV files on newline record boundaries.
   389  
   390    Currently does not handle quoted newlines, so is off by default, but such
   391    support could be added in the future.
   392    """
   393    def __init__(self, args, kwargs, read_chunk_size=_DEFAULT_BYTES_CHUNKSIZE):
   394      if args:
   395        # TODO(robertwb): Automatically populate kwargs as we do for df methods.
   396        raise ValueError(
   397            'Non-path arguments must be passed by keyword '
   398            'for splittable csv reads.')
   399      if kwargs.get('skipfooter', 0):
   400        raise ValueError('Splittability is incompatible with skipping footers.')
   401      super().__init__(
   402          _maybe_encode(kwargs.get('lineterminator', b'\n')),
   403          _DEFAULT_BYTES_CHUNKSIZE)
   404      self._kwargs = kwargs
   405  
   406    def read_header(self, handle):
   407      if self._kwargs.get('header', 'infer') == 'infer':
   408        if 'names' in self._kwargs:
   409          header = None
   410        else:
   411          header = 0
   412      else:
   413        header = self._kwargs['header']
   414  
   415      if header is None:
   416        return self._empty, self._empty
   417  
   418      if isinstance(header, int):
   419        max_header = header
   420      else:
   421        max_header = max(header)
   422  
   423      skiprows = self._kwargs.get('skiprows', 0)
   424      if isinstance(skiprows, int):
   425        is_skiprow = lambda ix: ix < skiprows
   426      elif callable(skiprows):
   427        is_skiprow = skiprows
   428      elif skiprows is None:
   429        is_skiprow = lambda ix: False
   430      else:
   431        is_skiprow = lambda ix: ix in skiprows
   432  
   433      comment = _maybe_encode(self._kwargs.get('comment', None))
   434      if comment:
   435        is_comment = lambda line: line.startswith(comment)
   436      else:
   437        is_comment = lambda line: False
   438  
   439      skip_blank_lines = self._kwargs.get('skip_blank_lines', True)
   440      if skip_blank_lines:
   441        is_blank = lambda line: re.match(rb'^\s*$', line)
   442      else:
   443        is_blank = lambda line: False
   444  
   445      text_header = b''
   446      rest = b''
   447      skipped = 0
   448      for ix in itertools.count():
   449        line, rest = self.read_to_record_boundary(rest, handle)
   450        text_header += line
   451        if is_skiprow(ix) or is_blank(line) or is_comment(line):
   452          skipped += 1
   453          continue
   454        if ix - skipped == max_header:
   455          return text_header, rest
   456  
   457  
   458  class _TruncatingFileHandle(object):
   459    """A wrapper of a file-like object representing the restriction of the
   460    underlying handle according to the given SDF restriction tracker, breaking
   461    the file only after the given delimiter.
   462  
   463    For example, if the underlying restriction is [103, 607) and each line were
   464    exactly 10 characters long (i.e. every 10th character was a newline), then
   465    this would give a view of a 500-byte file consisting of bytes 110 to 609
   466    (inclusive) of the underlying file.
   467  
   468    As with all SDF trackers, the endpoint may change dynamically during reading.
   469    """
   470    def __init__(self, underlying, tracker, splitter):
   471      self._underlying = underlying
   472      self._tracker = tracker
   473      self._splitter = splitter
   474  
   475      self._empty = self._splitter.empty_buffer()
   476      self._done = False
   477      self._header, self._buffer = self._splitter.read_header(self._underlying)
   478      self._buffer_start_pos = len(self._header)
   479      self._iterator = None
   480      start = self._tracker.current_restriction().start
   481      # Seek to first delimiter after the start position.
   482      if start > len(self._header):
   483        if start > len(self._header) + len(self._buffer):
   484          self._buffer_start_pos = start
   485          self._buffer = self._empty
   486          self._underlying.seek(start)
   487        else:
   488          self._buffer_start_pos = start
   489          self._buffer = self._buffer[start - len(self._header):]
   490        skip, self._buffer = self._splitter.read_to_record_boundary(
   491            self._buffer, self._underlying)
   492        self._buffer_start_pos += len(skip)
   493  
   494    def readable(self):
   495      return True
   496  
   497    def writable(self):
   498      return False
   499  
   500    def seekable(self):
   501      return False
   502  
   503    @property
   504    def closed(self):
   505      return False
   506  
   507    def __iter__(self):
   508      # For pandas is_file_like.
   509      return self
   510  
   511    def __next__(self):
   512      if self._iterator is None:
   513        self._iterator = self._line_iterator()
   514      return next(self._iterator)
   515  
   516    def readline(self):
   517      # This attribute is checked, but unused, by pandas.
   518      return next(self)
   519  
   520    def _line_iterator(self):
   521      line_start = 0
   522      chunk = self._read()
   523      while True:
   524        line_end = chunk.find(self._splitter._delim, line_start)
   525        while line_end == -1:
   526          more = self._read()
   527          if not more:
   528            if line_start < len(chunk):
   529              yield chunk[line_start:]
   530            return
   531          chunk = chunk[line_start:] + more
   532          line_start = 0
   533          line_end = chunk.find(self._splitter._delim, line_start)
   534        yield chunk[line_start:line_end + 1]
   535        line_start = line_end + 1
   536  
   537    def read(self, size=-1):
   538      if self._iterator:
   539        raise NotImplementedError('Cannot call read after iterating.')
   540      return self._read(size)
   541  
   542    def _read(self, size=-1):
   543      if self._header:
   544        res = self._header
   545        self._header = None
   546        return res
   547      elif self._done:
   548        return self._empty
   549      elif size == -1:
   550        self._buffer += self._underlying.read()
   551      elif not self._buffer:
   552        self._buffer = self._underlying.read(size)
   553  
   554      if not self._buffer:
   555        self._done = True
   556        return self._empty
   557  
   558      if self._tracker.try_claim(self._buffer_start_pos + len(self._buffer)):
   559        res = self._buffer
   560        self._buffer = self._empty
   561        self._buffer_start_pos += len(res)
   562      else:
   563        offset = self._tracker.current_restriction().stop - self._buffer_start_pos
   564        if offset <= 0:
   565          res = self._empty
   566        else:
   567          rest, _ = self._splitter.read_to_record_boundary(
   568              self._buffer[offset:], self._underlying)
   569          res = self._buffer[:offset] + rest
   570        self._done = True
   571      return res
   572  
   573  
   574  class _ReadFromPandasDoFn(beam.DoFn, beam.RestrictionProvider):
   575    def __init__(self, reader, args, kwargs, binary, incremental, splitter):
   576      # avoid pickling issues
   577      if reader.__module__.startswith('pandas.'):
   578        reader = reader.__name__
   579      self.reader = reader
   580      self.args = args
   581      self.kwargs = kwargs
   582      self.binary = binary
   583      self.incremental = incremental
   584      self.splitter = splitter
   585  
   586    def initial_restriction(self, readable_file):
   587      return beam.io.restriction_trackers.OffsetRange(
   588          0, readable_file.metadata.size_in_bytes)
   589  
   590    def restriction_size(self, readable_file, restriction):
   591      return restriction.size()
   592  
   593    def create_tracker(self, restriction):
   594      tracker = beam.io.restriction_trackers.OffsetRestrictionTracker(restriction)
   595      if self.splitter:
   596        return tracker
   597      else:
   598        return beam.io.restriction_trackers.UnsplittableRestrictionTracker(
   599            tracker)
   600  
   601    def process(
   602        self, readable_file, path_indices, tracker=beam.DoFn.RestrictionParam()):
   603      reader = self.reader
   604      if isinstance(reader, str):
   605        reader = getattr(pd, self.reader)
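            # Give each input file a disjoint block of indices_per_file index values
            # (the largest power of ten such that all files' blocks fit in 2**63), so
            # that the shifted RangeIndex values assigned below never collide across
            # files.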
   606      indices_per_file = 10**int(math.log(2**63 // len(path_indices), 10))
   607      if readable_file.metadata.size_in_bytes > indices_per_file:
   608        raise RuntimeError(
   609            f'Cannot safely index records from {len(path_indices)} files '
   610            f'of size {readable_file.metadata.size_in_bytes} '
   611            f'as their product is greater than 2^63.')
   612      start_index = (
   613          tracker.current_restriction().start +
   614          path_indices[readable_file.metadata.path] * indices_per_file)
   615      with readable_file.open() as handle:
   616        if self.incremental:
   617          # TODO(robertwb): We could consider trying to get progress for
   618          # non-incremental sources that are read linearly, as long as they
   619          # don't try to seek.  This could be deceptive as progress would
   620          # advance to 100% the instant the (large) read was done, discounting
   621          # any downstream processing.
   622          handle = _TruncatingFileHandle(
   623              handle,
   624              tracker,
   625              splitter=self.splitter or
   626              _DelimSplitter(b'\n', _DEFAULT_BYTES_CHUNKSIZE))
   627        if not self.binary:
   628          handle = TextIOWrapper(handle)
   629        if self.incremental:
   630          if 'chunksize' not in self.kwargs:
   631            self.kwargs['chunksize'] = _DEFAULT_LINES_CHUNKSIZE
   632          frames = reader(handle, *self.args, **self.kwargs)
   633        else:
   634          frames = [reader(handle, *self.args, **self.kwargs)]
   635        for df in frames:
   636          yield _shift_range_index(start_index, df)
   637        if not self.incremental:
   638          # Satisfy the SDF contract by claiming the whole range.
   639          # Do this after emitting the frames to avoid advancing progress to 100%
   640          # prior to that.
   641          tracker.try_claim(tracker.current_restriction().stop)
   642  
   643  
   644  class _WriteToPandas(beam.PTransform):
   645    def __init__(
   646        self, writer, path, args, kwargs, incremental=False, binary=True):
   647      self.writer = writer
   648      self.path = path
   649      self.args = args
   650      self.kwargs = kwargs
   651      self.incremental = incremental
   652      self.binary = binary
   653  
   654    def expand(self, pcoll):
   655      if 'file_naming' in self.kwargs:
   656        dir, name = self.path, ''
   657      else:
   658        dir, name = io.filesystems.FileSystems.split(self.path)
   659      return pcoll | fileio.WriteToFiles(
   660          path=dir,
   661          shards=self.kwargs.pop('num_shards', None),
   662          file_naming=self.kwargs.pop(
   663              'file_naming', fileio.default_file_naming(name)),
   664          sink=lambda _: _WriteToPandasFileSink(
   665              self.writer, self.args, self.kwargs, self.incremental, self.binary))
   666  
   667  
   668  class _WriteToPandasFileSink(fileio.FileSink):
   669    def __init__(self, writer, args, kwargs, incremental, binary):
   670      if 'compression' in kwargs:
   671        raise NotImplementedError('compression')
   672      self.writer = writer
   673      self.args = args
   674      self.kwargs = kwargs
   675      self.incremental = incremental
   676      self.binary = binary
   677      self.StringOrBytesIO = BytesIO if binary else StringIO
   678      if incremental:
   679        self.write = self.write_record_incremental
   680        self.flush = self.close_incremental
   681      else:
   682        self.write = self.buffer_record
   683        self.flush = self.flush_buffer
   684  
   685    def open(self, file_handle):
   686      self.buffer = []
   687      self.empty = self.header = self.footer = None
   688      if not self.binary:
   689        file_handle = TextIOWrapper(file_handle)
   690      self.file_handle = file_handle
   691  
   692    def write_to(self, df, file_handle=None):
   693      non_none_handle = file_handle or self.StringOrBytesIO()
   694      getattr(df, self.writer)(non_none_handle, *self.args, **self.kwargs)
   695      if file_handle is None:
   696        return non_none_handle.getvalue()
   697  
   698    def write_record_incremental(self, value):
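            # Infer this writer's header, footer and per-record delimiter by comparing
            # the serialized forms of zero, one and two rows; write the header once,
            # then only record bodies, leaving the footer to close_incremental().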
   699      if self.empty is None:
   700        self.empty = self.write_to(value[:0])
   701      if self.header is None and len(value):
   702  
   703        def new_value(ix):
   704          if isinstance(ix, tuple):
   705            return (new_value(ix[0]), ) + ix[1:]
   706          else:
   707            return str('x') + '_again'
   708  
   709        def change_index(df):
   710          df.index = df.index.map(new_value)
   711          return df
   712  
   713        one_row = self.write_to(value[:1])
   714        another_row = self.write_to(change_index(value[:1]))
   715        two_rows = self.write_to(pd.concat([value[:1], change_index(value[:1])]))
   716        for ix, c in enumerate(self.empty):
   717          if one_row[ix] != c:
   718            break
   719        else:
   720          ix = len(self.empty)
   721        self.header = self.empty[:ix]
   722        self.footer = self.empty[ix:]
   723        self.delimiter = two_rows[len(one_row) - len(self.footer):-(
   724            len(another_row) - len(self.header)) or None]
   725        self.file_handle.write(self.header)
   726        self.first = True
   727  
   728      if len(value):
   729        if self.first:
   730          self.first = False
   731        else:
   732          self.file_handle.write(self.delimiter)
   733  
   734        # IDEA(robertwb): Construct a "truncating" stream wrapper to avoid the
   735        # in-memory copy.
   736        rows = self.write_to(value)
   737        self.file_handle.write(rows[len(self.header):-len(self.footer) or None])
   738  
   739    def close_incremental(self):
   740      if self.footer is not None:
   741        self.file_handle.write(self.footer)
   742      elif self.empty is not None:
   743        self.file_handle.write(self.empty)
   744      self.file_handle.flush()
   745  
   746    def buffer_record(self, value):
   747      self.buffer.append(value)
   748  
   749    def flush_buffer(self):
   750      if self.buffer:
   751        self.write_to(pd.concat(self.buffer), self.file_handle)
   752        self.file_handle.flush()
   753  
   754  
   755  class ReadViaPandas(beam.PTransform):
   756    def __init__(
   757        self,
   758        format,
   759        *args,
   760        include_indexes=False,
   761        objects_as_strings=True,
   762        **kwargs):
   763      self._reader = globals()['read_%s' % format](*args, **kwargs)
   764      self._include_indexes = include_indexes
   765      self._objects_as_strings = objects_as_strings
   766  
   767    def expand(self, p):
   768      from apache_beam.dataframe import convert  # avoid circular import
   769      df = p | self._reader
   770      if self._objects_as_strings:
   771        for col, t in zip(df.columns, df.dtypes):
   772          if t == object:
   773            df[col] = df[col].astype(pd.StringDtype())
   774      return convert.to_pcollection(df, include_indexes=self._include_indexes)
   775  
   776  
   777  class WriteViaPandas(beam.PTransform):
   778    def __init__(self, format, *args, **kwargs):
   779      self._writer_func = globals()['to_%s' % format]
   780      self._args = args
   781      self._kwargs = kwargs
   782  
   783    def expand(self, pcoll):
   784      from apache_beam.dataframe import convert  # avoid circular import
   785      return {
   786          'files_written': self._writer_func(
   787              convert.to_dataframe(pcoll), *self._args, **self._kwargs)
   788          | beam.Map(lambda file_result: file_result.file_name).with_output_types(
   789              str)
   790      }
   791  
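        # An illustrative, untested sketch of these PCollection-level wrappers; the
        # paths are placeholders:
        #
        #   with beam.Pipeline() as p:
        #       rows = p | ReadViaPandas('csv', 'gs://bucket/input*.csv')
        #       _ = rows | WriteViaPandas('csv', 'gs://bucket/output')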
   792  
   793  class _ReadGbq(beam.PTransform):
   794    """Read data from BigQuery with output type 'BEAM_ROW',
   795    then convert it into a deferred dataframe.
   796  
   797    This PTransform wraps the Python ReadFromBigQuery PTransform and
   798    sets output_type='BEAM_ROW' so that the result is a schema'd
   799    PCollection. Once applied to a pipeline object, that PCollection
   800    is passed to the to_dataframe() function, which converts it into
   801    a deferred dataframe.
   802  
   803    This PTransform currently does not support queries.
   804  
   805    Args:
   806      table (str): The ID of the table. The ID must contain only
   807        letters ``a-z``, ``A-Z``,
   808        numbers ``0-9``, underscores ``_`` or white spaces.
   809        Note that the table argument must contain the entire table
   810        reference specified as: ``'PROJECT:DATASET.TABLE'``.
   811      use_bqstorage_api (bool): Selects the method used to read from
   812        BigQuery, either 'EXPORT' or 'DIRECT_READ'. 'EXPORT' invokes a
   813        BigQuery export request
   814        (https://cloud.google.com/bigquery/docs/exporting-data).
   815        'DIRECT_READ' reads directly from BigQuery storage using the
   816        BigQuery Read API
   817        (https://cloud.google.com/bigquery/docs/reference/storage).
   818        If unspecified or set to False, the current default method
   819        ('EXPORT') is used. If set to True, 'DIRECT_READ' is used
   820        instead."""
   821    def __init__(
   822        self,
   823        table=None,
   824        dataset_id=None,
   825        project_id=None,
   826        use_bqstorage_api=None):
   827  
   828      self.table = table
   829      self.dataset_id = dataset_id
   830      self.project_id = project_id
   831      self.use_bqstorage_api = use_bqstorage_api
   832  
   833    def expand(self, root):
   834      from apache_beam.dataframe import convert  # avoid circular import
   835      if self.use_bqstorage_api:
   836        method = 'DIRECT_READ'
   837      else:
   838        method = 'EXPORT'
   839      return convert.to_dataframe(
   840          root
   841          | '_DataFrame_Read_From_BigQuery' >> beam.io.ReadFromBigQuery(
   842              table=self.table,
   843              dataset=self.dataset_id,
   844              project=self.project_id,
   845              method=method,
   846              output_type='BEAM_ROW'))