github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/io.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sources and sinks for the Beam DataFrame API.

Sources
#######
This module provides analogs for pandas ``read`` methods, like
:func:`pandas.read_csv`. However Beam sources like :func:`read_csv`
create a Beam :class:`~apache_beam.PTransform`, and return a
:class:`~apache_beam.dataframe.frames.DeferredDataFrame` or
:class:`~apache_beam.dataframe.frames.DeferredSeries` representing the contents
of the referenced file(s) or data source.

The result of these methods must be applied to a :class:`~apache_beam.Pipeline`
object, for example::

    df = p | beam.dataframe.io.read_csv(...)

Sinks
#####
This module also defines analogs for pandas sink, or ``to``, methods that
generate a Beam :class:`~apache_beam.PTransform`. Users should prefer calling
these operations from :class:`~apache_beam.dataframe.frames.DeferredDataFrame`
instances (for example with
:meth:`DeferredDataFrame.to_csv
<apache_beam.dataframe.frames.DeferredDataFrame.to_csv>`).
"""

import itertools
import math
import re
from io import BytesIO
from io import StringIO
from io import TextIOWrapper

import pandas as pd

import apache_beam as beam
from apache_beam import io
from apache_beam.dataframe import frame_base
from apache_beam.io import fileio

_DEFAULT_LINES_CHUNKSIZE = 10_000
_DEFAULT_BYTES_CHUNKSIZE = 1 << 20


def read_gbq(
    table, dataset=None, project_id=None, use_bqstorage_api=False, **kwargs):
  """Reads data from a BigQuery table and produces a
  :class:`~apache_beam.dataframe.frames.DeferredDataFrame`.

  Args:
    table (str): The table to read. This may be specified as
      'PROJECT:dataset.table', in which case the parameters below
      may be omitted.
    dataset (str): The dataset
      (may be omitted if table was specified as 'PROJECT:dataset.table').
    project_id (str): The project ID
      (may be omitted if table was specified as 'PROJECT:dataset.table').
    use_bqstorage_api (bool): Set to true to read through the BigQuery
      Storage API in ReadFromBigQuery; otherwise set it to false or leave
      it unspecified.
  """
  if table is None:
    raise ValueError("Please specify a BigQuery table to read from.")
  elif len(kwargs) > 0:
    raise ValueError(
        f"Encountered unsupported parameter(s) in read_gbq: {kwargs.keys()!r}")
  return _ReadGbq(table, dataset, project_id, use_bqstorage_api)
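

# Illustrative sketch (not part of the original module): the two equivalent
# ways of naming the table accepted by read_gbq above; 'PROJECT', 'dataset'
# and 'table' are placeholder names.
#
#   df = p | read_gbq('PROJECT:dataset.table')
#   df = p | read_gbq(
#       'table', dataset='dataset', project_id='PROJECT',
#       use_bqstorage_api=True)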


@frame_base.with_docs_from(pd)
def read_csv(path, *args, splittable=False, **kwargs):
  """If your files are large and records do not contain quoted newlines, you
  may pass the extra argument ``splittable=True`` to enable dynamic splitting
  for this read on newlines. Using this option for records that do contain
  quoted newlines may result in partial records and data corruption."""
  if 'nrows' in kwargs:
    raise ValueError('nrows not yet supported')
  return _ReadFromPandas(
      pd.read_csv,
      path,
      args,
      kwargs,
      incremental=True,
      splitter=_TextFileSplitter(args, kwargs) if splittable else None)
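

# Illustrative sketch (not part of the original module): enabling dynamic
# splitting for a large CSV read whose records contain no quoted newlines.
# The path below is a placeholder.
#
#   with beam.Pipeline() as p:
#     df = p | read_csv('/data/logs-*.csv', splittable=True)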


def _as_pc(df, label=None):
  from apache_beam.dataframe import convert  # avoid circular import
  # TODO(roberwb): Amortize the computation for multiple writes?
  return convert.to_pcollection(df, yield_elements='pandas', label=label)


@frame_base.with_docs_from(pd.DataFrame)
def to_csv(df, path, transform_label=None, *args, **kwargs):
  label_pc = f"{transform_label} - ToPCollection" if transform_label \
      else f"ToPCollection(df) - {path}"
  label_pd = f"{transform_label} - ToPandasDataFrame" if transform_label \
      else f"WriteToPandas(df) - {path}"
  return _as_pc(df, label_pc) | label_pd >> _WriteToPandas(
      'to_csv', path, args, kwargs, incremental=True, binary=False)


@frame_base.with_docs_from(pd)
def read_fwf(path, *args, **kwargs):
  return _ReadFromPandas(
      pd.read_fwf,
      path,
      args,
      kwargs,
      incremental=True,
      binary=False,
      splitter=_TextFileSplitter(args, kwargs))


@frame_base.with_docs_from(pd)
def read_json(path, *args, **kwargs):
  if 'nrows' in kwargs:
    raise NotImplementedError('nrows not yet supported')
  elif kwargs.get('lines', False):
    # Work around https://github.com/pandas-dev/pandas/issues/34548.
    kwargs = dict(kwargs, nrows=1 << 63)
  return _ReadFromPandas(
      pd.read_json,
      path,
      args,
      kwargs,
      incremental=kwargs.get('lines', False),
      splitter=_DelimSplitter(b'\n', _DEFAULT_BYTES_CHUNKSIZE)
      if kwargs.get('lines', False) else None,
      binary=False)


@frame_base.with_docs_from(pd.DataFrame)
def to_json(df, path, orient=None, *args, **kwargs):
  if orient is None:
    if isinstance(df._expr.proxy(), pd.DataFrame):
      orient = 'columns'
    elif isinstance(df._expr.proxy(), pd.Series):
      orient = 'index'
    else:
      raise frame_base.WontImplementError('not dataframes or series')
  kwargs['orient'] = orient
  return _as_pc(df) | _WriteToPandas(
      'to_json',
      path,
      args,
      kwargs,
      incremental=orient in ('index', 'records', 'values'),
      binary=False)


@frame_base.with_docs_from(pd)
def read_html(path, *args, **kwargs):
  return _ReadFromPandas(
      lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
      path,
      args,
      kwargs)


@frame_base.with_docs_from(pd.DataFrame)
def to_html(df, path, *args, **kwargs):
  return _as_pc(df) | _WriteToPandas(
      'to_html',
      path,
      args,
      kwargs,
      incremental=(
          df._expr.proxy().index.nlevels == 1 or
          not kwargs.get('sparsify', True)),
      binary=False)


def _binary_reader(format):
  func = getattr(pd, 'read_%s' % format)
  result = lambda path, *args, **kwargs: _ReadFromPandas(
      func, path, args, kwargs)
  result.__name__ = f'read_{format}'

  return result


def _binary_writer(format):
  result = (
      lambda df, path, *args, **kwargs: _as_pc(df) | _WriteToPandas(
          f'to_{format}', path, args, kwargs))
  result.__name__ = f'to_{format}'
  return result


for format in ('excel', 'feather', 'parquet', 'stata'):
  globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
      _binary_reader(format))
  globals()['to_%s' % format] = frame_base.with_docs_from(pd.DataFrame)(
      _binary_writer(format))
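

# Illustrative sketch (not part of the original module): the loop above
# registers module-level functions such as read_parquet/to_parquet for the
# binary formats. The paths below are placeholders.
#
#   with beam.Pipeline() as p:
#     df = p | read_parquet('/data/input-*.parquet')
#     to_parquet(df, '/data/output.parquet')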

for format in ('sas', 'spss'):
  if hasattr(pd, 'read_%s' % format):  # Depends on pandas version.
    globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
        _binary_reader(format))

read_clipboard = frame_base.not_implemented_method(
    'read_clipboard', base_type=pd)
to_clipboard = frame_base.not_implemented_method(
    'to_clipboard', base_type=pd.DataFrame)
read_msgpack = frame_base.wont_implement_method(
    pd, 'read_msgpack', reason="deprecated")
to_msgpack = frame_base.wont_implement_method(
    pd.DataFrame, 'to_msgpack', reason="deprecated")
read_hdf = frame_base.wont_implement_method(
    pd, 'read_hdf', explanation="because HDF5 is a random access file format")
to_hdf = frame_base.wont_implement_method(
    pd.DataFrame,
    'to_hdf',
    explanation="because HDF5 is a random access file format")

for name in dir(pd):
  if name.startswith('read_') and name not in globals():
    globals()[name] = frame_base.not_implemented_method(name, base_type=pd)


def _shift_range_index(offset, df):
  if isinstance(df.index, pd.RangeIndex):
    return df.set_index(df.index + offset)
  else:
    return df


class _ReadFromPandas(beam.PTransform):
  def __init__(
      self,
      reader,
      path,
      args,
      kwargs,
      binary=True,
      incremental=False,
      splitter=False):
    if 'compression' in kwargs:
      raise NotImplementedError('compression')
    if not isinstance(path, str):
      raise frame_base.WontImplementError('non-deferred')
    self.reader = reader
    self.path = path
    self.args = args
    self.kwargs = kwargs
    self.binary = binary
    self.incremental = incremental
    self.splitter = splitter

  def expand(self, root):
    paths_pcoll = root | beam.Create([self.path])
    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
      # TODO(https://github.com/apache/beam/issues/20858): This should be
      # allowed for streaming pipelines if user provides an explicit schema.
      raise FileNotFoundError(f"Found no files that match {self.path!r}")
    first_path = match.metadata_list[0].path
    with io.filesystems.FileSystems.open(first_path) as handle:
      if not self.binary:
        handle = TextIOWrapper(handle)
      if self.incremental:
        sample = next(
            self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
      else:
        sample = self.reader(handle, *self.args, **self.kwargs)

    matches_pcoll = paths_pcoll | fileio.MatchAll()
    indices_pcoll = (
        matches_pcoll.pipeline
        | 'DoOnce' >> beam.Create([None])
        | beam.Map(
            lambda _, paths:
            {path: ix for ix, path in enumerate(sorted(paths))},
            paths=beam.pvalue.AsList(
                matches_pcoll | beam.Map(lambda match: match.path))))

    pcoll = (
        matches_pcoll
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(
                self.reader,
                self.args,
                self.kwargs,
                self.binary,
                self.incremental,
                self.splitter),
            path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(pcoll, proxy=sample[:0])


class _Splitter:
  def empty_buffer(self):
    """Returns an empty buffer of the right type (string or bytes)."""
    raise NotImplementedError(self)

  def read_header(self, handle):
    """Reads the header from handle, which points to the start of the file.

    Returns the pair (header, buffer) where buffer contains any part of the
    file that was "overread" from handle while seeking the end of the header.
    """
    raise NotImplementedError(self)

  def read_to_record_boundary(self, buffered, handle):
    """Reads the given handle up to the end of the current record.

    The buffered argument represents bytes that were read previously;
    logically it's as if these were pushed back into handle for reading. If
    the record end is within buffered, it's possible that no more bytes will
    be read from handle at all.

    Returns the pair (remaining_record_bytes, buffer) where buffer contains
    any part of the file that was "overread" from handle while seeking the end
    of the record.
    """
    raise NotImplementedError(self)


class _DelimSplitter(_Splitter):
  """A _Splitter that splits on delimiters between records.

  This delimiter is assumed to never occur within a record.
  """
  def __init__(self, delim, read_chunk_size=_DEFAULT_BYTES_CHUNKSIZE):
    # Multi-char delimiters would require more care across chunk boundaries.
    assert len(delim) == 1
    self._delim = delim
    self._empty = delim[:0]
    self._read_chunk_size = read_chunk_size

  def empty_buffer(self):
    return self._empty

  def read_header(self, handle):
    return self._empty, self._empty

  def read_to_record_boundary(self, buffered, handle):
    if self._delim in buffered:
      ix = buffered.index(self._delim) + len(self._delim)
      return buffered[:ix], buffered[ix:]
    else:
      while True:
        chunk = handle.read(self._read_chunk_size)
        if self._delim in chunk:
          ix = chunk.index(self._delim) + len(self._delim)
          return buffered + chunk[:ix], chunk[ix:]
        elif not chunk:
          return buffered, self._empty
        else:
          buffered += chunk
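

# Illustrative worked example (not part of the original module) of the
# _Splitter contract as implemented by _DelimSplitter: the first element is
# the completed record, the second is whatever was "overread" past it.
#
#   _DelimSplitter(b'\n').read_to_record_boundary(b'abc', BytesIO(b'def\nghi'))
#   # -> (b'abcdef\n', b'ghi')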


def _maybe_encode(str_or_bytes):
  if isinstance(str_or_bytes, str):
    return str_or_bytes.encode('utf-8')
  else:
    return str_or_bytes


class _TextFileSplitter(_DelimSplitter):
  """Splitter for dynamically sharding CSV files on newline record boundaries.

  Currently does not handle quoted newlines, so is off by default, but such
  support could be added in the future.
  """
  def __init__(self, args, kwargs, read_chunk_size=_DEFAULT_BYTES_CHUNKSIZE):
    if args:
      # TODO(robertwb): Automatically populate kwargs as we do for df methods.
      raise ValueError(
          'Non-path arguments must be passed by keyword '
          'for splittable csv reads.')
    if kwargs.get('skipfooter', 0):
      raise ValueError('Splittability is incompatible with skipping footers.')
    super().__init__(
        _maybe_encode(kwargs.get('lineterminator', b'\n')),
        _DEFAULT_BYTES_CHUNKSIZE)
    self._kwargs = kwargs

  def read_header(self, handle):
    if self._kwargs.get('header', 'infer') == 'infer':
      if 'names' in self._kwargs:
        header = None
      else:
        header = 0
    else:
      header = self._kwargs['header']

    if header is None:
      return self._empty, self._empty

    if isinstance(header, int):
      max_header = header
    else:
      max_header = max(header)

    skiprows = self._kwargs.get('skiprows', 0)
    if isinstance(skiprows, int):
      is_skiprow = lambda ix: ix < skiprows
    elif callable(skiprows):
      is_skiprow = skiprows
    elif skiprows is None:
      is_skiprow = lambda ix: False
    else:
      is_skiprow = lambda ix: ix in skiprows

    comment = _maybe_encode(self._kwargs.get('comment', None))
    if comment:
      is_comment = lambda line: line.startswith(comment)
    else:
      is_comment = lambda line: False

    skip_blank_lines = self._kwargs.get('skip_blank_lines', True)
    if skip_blank_lines:
      is_blank = lambda line: re.match(rb'^\s*$', line)
    else:
      is_blank = lambda line: False

    text_header = b''
    rest = b''
    skipped = 0
    for ix in itertools.count():
      line, rest = self.read_to_record_boundary(rest, handle)
      text_header += line
      if is_skiprow(ix) or is_blank(line) or is_comment(line):
        skipped += 1
        continue
      if ix - skipped == max_header:
        return text_header, rest
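

# Illustrative worked example (not part of the original module): with default
# options, _TextFileSplitter.read_header peels off everything up to and
# including the header row and returns the overread remainder.
#
#   _TextFileSplitter((), {}).read_header(BytesIO(b'col1,col2\n1,2\n3,4\n'))
#   # -> (b'col1,col2\n', b'1,2\n3,4\n')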


class _TruncatingFileHandle(object):
  """A wrapper of a file-like object representing the restriction of the
  underlying handle according to the given SDF restriction tracker, breaking
  the file only after the given delimiter.

  For example, if the underlying restriction is [103, 607) and each line were
  exactly 10 characters long (i.e. every 10th character was a newline), then
  this would give a view of a 500-byte file consisting of bytes 110 to 609
  (inclusive) of the underlying file.

  As with all SDF trackers, the endpoint may change dynamically during reading.
  """
  def __init__(self, underlying, tracker, splitter):
    self._underlying = underlying
    self._tracker = tracker
    self._splitter = splitter

    self._empty = self._splitter.empty_buffer()
    self._done = False
    self._header, self._buffer = self._splitter.read_header(self._underlying)
    self._buffer_start_pos = len(self._header)
    self._iterator = None
    start = self._tracker.current_restriction().start
    # Seek to first delimiter after the start position.
    if start > len(self._header):
      if start > len(self._header) + len(self._buffer):
        self._buffer_start_pos = start
        self._buffer = self._empty
        self._underlying.seek(start)
      else:
        self._buffer_start_pos = start
        self._buffer = self._buffer[start - len(self._header):]
      skip, self._buffer = self._splitter.read_to_record_boundary(
          self._buffer, self._underlying)
      self._buffer_start_pos += len(skip)

  def readable(self):
    return True

  def writable(self):
    return False

  def seekable(self):
    return False

  @property
  def closed(self):
    return False

  def __iter__(self):
    # For pandas is_file_like.
    return self

  def __next__(self):
    if self._iterator is None:
      self._iterator = self._line_iterator()
    return next(self._iterator)

  def readline(self):
    # This attribute is checked, but unused, by pandas.
    return next(self)

  def _line_iterator(self):
    line_start = 0
    chunk = self._read()
    while True:
      line_end = chunk.find(self._splitter._delim, line_start)
      while line_end == -1:
        more = self._read()
        if not more:
          if line_start < len(chunk):
            yield chunk[line_start:]
          return
        chunk = chunk[line_start:] + more
        line_start = 0
        line_end = chunk.find(self._splitter._delim, line_start)
      yield chunk[line_start:line_end + 1]
      line_start = line_end + 1

  def read(self, size=-1):
    if self._iterator:
      raise NotImplementedError('Cannot call read after iterating.')
    return self._read(size)

  def _read(self, size=-1):
    if self._header:
      res = self._header
      self._header = None
      return res
    elif self._done:
      return self._empty
    elif size == -1:
      self._buffer += self._underlying.read()
    elif not self._buffer:
      self._buffer = self._underlying.read(size)

    if not self._buffer:
      self._done = True
      return self._empty

    if self._tracker.try_claim(self._buffer_start_pos + len(self._buffer)):
      res = self._buffer
      self._buffer = self._empty
      self._buffer_start_pos += len(res)
    else:
      offset = self._tracker.current_restriction().stop - self._buffer_start_pos
      if offset <= 0:
        res = self._empty
      else:
        rest, _ = self._splitter.read_to_record_boundary(
            self._buffer[offset:], self._underlying)
        res = self._buffer[:offset] + rest
      self._done = True
    return res
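

# Illustrative sketch (not part of the original module): restricting an
# in-memory file to the byte range [3, 8) of b'aa\nbb\ncc\ndd\n'. The handle
# skips ahead to the first newline at or after byte 3 and stops after the
# record that straddles the end of the range, so it serves exactly one
# record, b'cc\n'.
#
#   from apache_beam.io.restriction_trackers import OffsetRange
#   from apache_beam.io.restriction_trackers import OffsetRestrictionTracker
#
#   tracker = OffsetRestrictionTracker(OffsetRange(3, 8))
#   handle = _TruncatingFileHandle(
#       BytesIO(b'aa\nbb\ncc\ndd\n'), tracker, _DelimSplitter(b'\n'))
#   handle.read()  # -> b'cc\n'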


class _ReadFromPandasDoFn(beam.DoFn, beam.RestrictionProvider):
  def __init__(self, reader, args, kwargs, binary, incremental, splitter):
    # avoid pickling issues
    if reader.__module__.startswith('pandas.'):
      reader = reader.__name__
    self.reader = reader
    self.args = args
    self.kwargs = kwargs
    self.binary = binary
    self.incremental = incremental
    self.splitter = splitter

  def initial_restriction(self, readable_file):
    return beam.io.restriction_trackers.OffsetRange(
        0, readable_file.metadata.size_in_bytes)

  def restriction_size(self, readable_file, restriction):
    return restriction.size()

  def create_tracker(self, restriction):
    tracker = beam.io.restriction_trackers.OffsetRestrictionTracker(restriction)
    if self.splitter:
      return tracker
    else:
      return beam.io.restriction_trackers.UnsplittableRestrictionTracker(
          tracker)

  def process(
      self, readable_file, path_indices, tracker=beam.DoFn.RestrictionParam()):
    reader = self.reader
    if isinstance(reader, str):
      reader = getattr(pd, self.reader)
    indices_per_file = 10**int(math.log(2**63 // len(path_indices), 10))
    if readable_file.metadata.size_in_bytes > indices_per_file:
      raise RuntimeError(
          f'Cannot safely index records from {len(path_indices)} files '
          f'of size {readable_file.metadata.size_in_bytes} '
          f'as their product is greater than 2^63.')
    start_index = (
        tracker.current_restriction().start +
        path_indices[readable_file.metadata.path] * indices_per_file)
    with readable_file.open() as handle:
      if self.incremental:
        # TODO(robertwb): We could consider trying to get progress for
        # non-incremental sources that are read linearly, as long as they
        # don't try to seek. This could be deceptive as progress would
        # advance to 100% the instant the (large) read was done, discounting
        # any downstream processing.
        handle = _TruncatingFileHandle(
            handle,
            tracker,
            splitter=self.splitter or
            _DelimSplitter(b'\n', _DEFAULT_BYTES_CHUNKSIZE))
      if not self.binary:
        handle = TextIOWrapper(handle)
      if self.incremental:
        if 'chunksize' not in self.kwargs:
          self.kwargs['chunksize'] = _DEFAULT_LINES_CHUNKSIZE
        frames = reader(handle, *self.args, **self.kwargs)
      else:
        frames = [reader(handle, *self.args, **self.kwargs)]
      for df in frames:
        yield _shift_range_index(start_index, df)
      if not self.incremental:
        # Satisfy the SDF contract by claiming the whole range.
        # Do this after emitting the frames to avoid advancing progress to 100%
        # prior to that.
        tracker.try_claim(tracker.current_restriction().stop)
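

# Illustrative arithmetic (not part of the original module): with 3 matched
# files, indices_per_file is 10**int(math.log(2**63 // 3, 10)) == 10**18, so
# rows read from file k (with a default RangeIndex) are assigned indices
# starting at k * 10**18 plus the byte offset at which the restriction starts.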


class _WriteToPandas(beam.PTransform):
  def __init__(
      self, writer, path, args, kwargs, incremental=False, binary=True):
    self.writer = writer
    self.path = path
    self.args = args
    self.kwargs = kwargs
    self.incremental = incremental
    self.binary = binary

  def expand(self, pcoll):
    if 'file_naming' in self.kwargs:
      dir, name = self.path, ''
    else:
      dir, name = io.filesystems.FileSystems.split(self.path)
    return pcoll | fileio.WriteToFiles(
        path=dir,
        shards=self.kwargs.pop('num_shards', None),
        file_naming=self.kwargs.pop(
            'file_naming', fileio.default_file_naming(name)),
        sink=lambda _: _WriteToPandasFileSink(
            self.writer, self.args, self.kwargs, self.incremental, self.binary))


class _WriteToPandasFileSink(fileio.FileSink):
  def __init__(self, writer, args, kwargs, incremental, binary):
    if 'compression' in kwargs:
      raise NotImplementedError('compression')
    self.writer = writer
    self.args = args
    self.kwargs = kwargs
    self.incremental = incremental
    self.binary = binary
    self.StringOrBytesIO = BytesIO if binary else StringIO
    if incremental:
      self.write = self.write_record_incremental
      self.flush = self.close_incremental
    else:
      self.write = self.buffer_record
      self.flush = self.flush_buffer

  def open(self, file_handle):
    self.buffer = []
    self.empty = self.header = self.footer = None
    if not self.binary:
      file_handle = TextIOWrapper(file_handle)
    self.file_handle = file_handle

  def write_to(self, df, file_handle=None):
    non_none_handle = file_handle or self.StringOrBytesIO()
    getattr(df, self.writer)(non_none_handle, *self.args, **self.kwargs)
    if file_handle is None:
      return non_none_handle.getvalue()

  def write_record_incremental(self, value):
    if self.empty is None:
      self.empty = self.write_to(value[:0])
    if self.header is None and len(value):

      def new_value(ix):
        if isinstance(ix, tuple):
          return (new_value(ix[0]), ) + ix[1:]
        else:
          return str('x') + '_again'

      def change_index(df):
        df.index = df.index.map(new_value)
        return df

      one_row = self.write_to(value[:1])
      another_row = self.write_to(change_index(value[:1]))
      two_rows = self.write_to(pd.concat([value[:1], change_index(value[:1])]))
      for ix, c in enumerate(self.empty):
        if one_row[ix] != c:
          break
      else:
        ix = len(self.empty)
      self.header = self.empty[:ix]
      self.footer = self.empty[ix:]
      self.delimiter = two_rows[len(one_row) - len(self.footer):-(
          len(another_row) - len(self.header)) or None]
      self.file_handle.write(self.header)
      self.first = True

    if len(value):
      if self.first:
        self.first = False
      else:
        self.file_handle.write(self.delimiter)

      # IDEA(robertwb): Construct a "truncating" stream wrapper to avoid the
      # in-memory copy.
      rows = self.write_to(value)
      self.file_handle.write(rows[len(self.header):-len(self.footer) or None])

  def close_incremental(self):
    if self.footer is not None:
      self.file_handle.write(self.footer)
    elif self.empty is not None:
      self.file_handle.write(self.empty)
    self.file_handle.flush()

  def buffer_record(self, value):
    self.buffer.append(value)

  def flush_buffer(self):
    if self.buffer:
      self.write_to(pd.concat(self.buffer), self.file_handle)
      self.file_handle.flush()
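

# Illustrative note (not part of the original module) on the header/footer/
# delimiter inference in write_record_incremental above: for
# to_json(orient='records'), writing an empty frame gives '[]' and a one-row
# frame gives '[{...}]', so the inferred header is '[', the footer is ']' and
# the delimiter between records is ','. For to_csv, the whole empty output
# matches the start of the one-row output, so the header is the column-header
# line and the footer and delimiter are both empty.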


class ReadViaPandas(beam.PTransform):
  def __init__(
      self,
      format,
      *args,
      include_indexes=False,
      objects_as_strings=True,
      **kwargs):
    self._reader = globals()['read_%s' % format](*args, **kwargs)
    self._include_indexes = include_indexes
    self._objects_as_strings = objects_as_strings

  def expand(self, p):
    from apache_beam.dataframe import convert  # avoid circular import
    df = p | self._reader
    if self._objects_as_strings:
      for col, t in zip(df.columns, df.dtypes):
        if t == object:
          df[col] = df[col].astype(pd.StringDtype())
    return convert.to_pcollection(df, include_indexes=self._include_indexes)


class WriteViaPandas(beam.PTransform):
  def __init__(self, format, *args, **kwargs):
    self._writer_func = globals()['to_%s' % format]
    self._args = args
    self._kwargs = kwargs

  def expand(self, pcoll):
    from apache_beam.dataframe import convert  # avoid circular import
    return {
        'files_written': self._writer_func(
            convert.to_dataframe(pcoll), *self._args, **self._kwargs)
        | beam.Map(lambda file_result: file_result.file_name).with_output_types(
            str)
    }
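

# Illustrative sketch (not part of the original module): the wrappers above
# read and write schema'd PCollections rather than deferred dataframes. The
# paths are placeholders.
#
#   with beam.Pipeline() as p:
#     rows = p | ReadViaPandas('csv', '/data/in-*.csv')
#     _ = rows | WriteViaPandas('csv', '/data/out.csv', index=False)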


class _ReadGbq(beam.PTransform):
  """Read data from BigQuery with output type 'BEAM_ROW',
  then convert it into a deferred dataframe.

  This PTransform wraps the Python ReadFromBigQuery PTransform,
  and sets the output_type to 'BEAM_ROW' to convert the result
  into a Beam Schema. Once applied to a pipeline object,
  it is passed into the to_dataframe() function to convert the
  PCollection into a deferred dataframe.

  This PTransform currently does not support queries.

  Args:
    table (str): The ID of the table. The ID must contain only
      letters ``a-z``, ``A-Z``, numbers ``0-9``,
      underscores ``_`` or white spaces.
      Note that the table argument must contain the entire table
      reference specified as: ``'PROJECT:DATASET.TABLE'``.
    use_bqstorage_api (bool): Selects the method used to read from
      BigQuery, which may be 'EXPORT' or 'DIRECT_READ'. 'EXPORT' invokes
      a BigQuery export request
      (https://cloud.google.com/bigquery/docs/exporting-data).
      'DIRECT_READ' reads directly from BigQuery storage using the
      BigQuery Read API
      (https://cloud.google.com/bigquery/docs/reference/storage).
      If this flag is unspecified or set to false, the default method
      ('EXPORT') is used; if it is set to true, 'DIRECT_READ' is used."""
  def __init__(
      self,
      table=None,
      dataset_id=None,
      project_id=None,
      use_bqstorage_api=None):

    self.table = table
    self.dataset_id = dataset_id
    self.project_id = project_id
    self.use_bqstorage_api = use_bqstorage_api

  def expand(self, root):
    from apache_beam.dataframe import convert  # avoid circular import
    if self.use_bqstorage_api:
      method = 'DIRECT_READ'
    else:
      method = 'EXPORT'
    return convert.to_dataframe(
        root
        | '_DataFrame_Read_From_BigQuery' >> beam.io.ReadFromBigQuery(
            table=self.table,
            dataset=self.dataset_id,
            project=self.project_id,
            method=method,
            output_type='BEAM_ROW'))
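

# Illustrative end-to-end sketch (not part of the original module), combining
# a source from this module with the DeferredDataFrame sink method that the
# module docstring recommends. The paths are placeholders.
#
#   with beam.Pipeline() as p:
#     df = p | read_csv('/data/input-*.csv')
#     df.to_csv('/data/output.csv')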