github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/jdbc.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/jdbc.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """PTransforms for supporting Jdbc in Python pipelines.
    19  
    20    These transforms are currently supported by Beam portable
    21    Flink, Spark, and Dataflow v2 runners.
    22  
    23    **Setup**
    24  
    25    Transforms provided in this module are cross-language transforms
    26    implemented in the Beam Java SDK. During the pipeline construction, Python SDK
    27    will connect to a Java expansion service to expand these transforms.
    28    To facilitate this, a small amount of setup is needed before using these
    29    transforms in a Beam Python pipeline.
    30  
    31    There are several ways to setup cross-language Jdbc transforms.
    32  
    33    * Option 1: use the default expansion service
    34    * Option 2: specify a custom expansion service
    35  
    36    See below for details regarding each of these options.
    37  
    38    *Option 1: Use the default expansion service*
    39  
    40    This is the recommended and easiest setup option for using Python Jdbc
    41    transforms. This option is only available for Beam 2.24.0 and later.
    42  
    43    This option requires following pre-requisites before running the Beam
    44    pipeline.
    45  
    46    * Install Java runtime in the computer from where the pipeline is constructed
    47      and make sure that 'java' command is available.
    48  
    49    In this option, Python SDK will either download (for released Beam version) or
    50    build (when running from a Beam Git clone) a expansion service jar and use
    51    that to expand transforms. Currently Jdbc transforms use the
    52    'beam-sdks-java-io-expansion-service' jar for this purpose.
    53  
    54    The transforms in this file support an extra `classpath` argument, where one
    55    can specify any extra JARs to be included in the classpath for the expansion
    56    service. Users will need to specify this option to include the JDBC driver
    57    for the database that you're trying to use. **By default, a Postgres JDBC
    58    driver** is included (i.e. the Java package
    59    `"org.postgresql:postgresql:42.2.16"`).
    60  
    61    *Option 2: specify a custom expansion service*
    62  
    63    In this option, you startup your own expansion service and provide that as
    64    a parameter when using the transforms provided in this module.
    65  
    66    This option requires following pre-requisites before running the Beam
    67    pipeline.
    68  
    69    * Startup your own expansion service.
    70    * Update your pipeline to provide the expansion service address when
    71      initiating Jdbc transforms provided in this module.
    72  
    73    Flink Users can use the built-in Expansion Service of the Flink Runner's
    74    Job Server. If you start Flink's Job Server, the expansion service will be
    75    started on port 8097. For a different address, please set the
    76    expansion_service parameter.
    77  
    78    **More information**
    79  
    80    For more information regarding cross-language transforms see:
    81    - https://beam.apache.org/roadmap/portability/
    82  
    83    For more information specific to Flink runner see:
    84    - https://beam.apache.org/documentation/runners/flink/
    85  """
    86  
    87  # pytype: skip-file
    88  
    89  import typing
    90  
    91  import numpy as np
    92  
    93  from apache_beam.coders import RowCoder
    94  from apache_beam.transforms.external import BeamJarExpansionService
    95  from apache_beam.transforms.external import ExternalTransform
    96  from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder
    97  from apache_beam.typehints.schemas import typing_to_runner_api
    98  
    99  __all__ = [
   100      'WriteToJdbc',
   101      'ReadFromJdbc',
   102  ]
   103  
   104  
   105  def default_io_expansion_service(classpath=None):
   106    return BeamJarExpansionService(
   107        ':sdks:java:extensions:schemaio-expansion-service:shadowJar',
   108        classpath=classpath)
   109  
   110  
   111  JdbcConfigSchema = typing.NamedTuple(
   112      'JdbcConfigSchema',
   113      [('location', str), ('config', bytes)],
   114  )
   115  
   116  Config = typing.NamedTuple(
   117      'Config',
   118      [('driver_class_name', str), ('jdbc_url', str), ('username', str),
   119       ('password', str), ('connection_properties', typing.Optional[str]),
   120       ('connection_init_sqls', typing.Optional[typing.List[str]]),
   121       ('read_query', typing.Optional[str]),
   122       ('write_statement', typing.Optional[str]),
   123       ('fetch_size', typing.Optional[np.int16]),
   124       ('output_parallelization', typing.Optional[bool]),
   125       ('autosharding', typing.Optional[bool]),
   126       ('partition_column', typing.Optional[str]),
   127       ('partitions', typing.Optional[np.int16]),
   128       ('max_connections', typing.Optional[np.int16]),
   129       ('driver_jars', typing.Optional[str])],
   130  )
   131  
   132  DEFAULT_JDBC_CLASSPATH = ['org.postgresql:postgresql:42.2.16']
   133  
   134  
   135  class WriteToJdbc(ExternalTransform):
   136    """A PTransform which writes Rows to the specified database via JDBC.
   137  
   138    This transform receives Rows defined as NamedTuple type and registered in
   139    the coders registry, e.g.::
   140  
   141      ExampleRow = typing.NamedTuple('ExampleRow',
   142                                     [('id', int), ('name', unicode)])
   143      coders.registry.register_coder(ExampleRow, coders.RowCoder)
   144  
   145      with TestPipeline() as p:
   146        _ = (
   147            p
   148            | beam.Create([ExampleRow(1, 'abc')])
   149                .with_output_types(ExampleRow)
   150            | 'Write to jdbc' >> WriteToJdbc(
   151                table_name='jdbc_external_test_write'
   152                driver_class_name='org.postgresql.Driver',
   153                jdbc_url='jdbc:postgresql://localhost:5432/example',
   154                username='postgres',
   155                password='postgres',
   156            ))
   157  
   158    table_name is a required paramater, and by default, the write_statement is
   159    generated from it.
   160  
   161    The generated write_statement can be overridden by passing in a
   162    write_statment.
   163  
   164  
   165    Experimental; no backwards compatibility guarantees.
   166    """
   167  
   168    URN = 'beam:transform:org.apache.beam:schemaio_jdbc_write:v1'
   169  
   170    def __init__(
   171        self,
   172        table_name,
   173        driver_class_name,
   174        jdbc_url,
   175        username,
   176        password,
   177        statement=None,
   178        connection_properties=None,
   179        connection_init_sqls=None,
   180        autosharding=False,
   181        max_connections=None,
   182        driver_jars=None,
   183        expansion_service=None,
   184        classpath=None,
   185    ):
   186      """
   187      Initializes a write operation to Jdbc.
   188  
   189      :param driver_class_name: name of the jdbc driver class
   190      :param jdbc_url: full jdbc url to the database.
   191      :param username: database username
   192      :param password: database password
   193      :param statement: sql statement to be executed
   194      :param connection_properties: properties of the jdbc connection
   195                                    passed as string with format
   196                                    [propertyName=property;]*
   197      :param connection_init_sqls: required only for MySql and MariaDB.
   198                                   passed as list of strings
   199      :param autosharding: enable automatic re-sharding of bundles to scale the
   200                           number of shards with the number of workers.
   201      :param max_connections: sets the maximum total number of connections.
   202                              use a negative value for no limit.
   203      :param driver_jars: comma separated paths for JDBC drivers. if not
   204                          specified, the default classloader is used to load the
   205                          driver jars.
   206      :param expansion_service: The address (host:port) of the ExpansionService.
   207      :param classpath: A list of JARs or Java packages to include in the
   208                        classpath for the expansion service. This option is
   209                        usually needed for `jdbc` to include extra JDBC driver
   210                        packages.
   211                        The packages can be in these three formats: (1) A local
   212                        file, (2) A URL, (3) A gradle-style identifier of a Maven
   213                        package (e.g. "org.postgresql:postgresql:42.3.1").
   214                        By default, this argument includes a Postgres SQL JDBC
   215                        driver.
   216      """
   217      classpath = classpath or DEFAULT_JDBC_CLASSPATH
   218      super().__init__(
   219          self.URN,
   220          NamedTupleBasedPayloadBuilder(
   221              JdbcConfigSchema(
   222                  location=table_name,
   223                  config=RowCoder(
   224                      typing_to_runner_api(Config).row_type.schema).encode(
   225                          Config(
   226                              driver_class_name=driver_class_name,
   227                              jdbc_url=jdbc_url,
   228                              username=username,
   229                              password=password,
   230                              connection_properties=connection_properties,
   231                              connection_init_sqls=connection_init_sqls,
   232                              write_statement=statement,
   233                              read_query=None,
   234                              fetch_size=None,
   235                              output_parallelization=None,
   236                              autosharding=autosharding,
   237                              max_connections=max_connections,
   238                              driver_jars=driver_jars,
   239                              partitions=None,
   240                              partition_column=None))),
   241          ),
   242          expansion_service or default_io_expansion_service(classpath),
   243      )
   244  
   245  
   246  class ReadFromJdbc(ExternalTransform):
   247    """A PTransform which reads Rows from the specified database via JDBC.
   248  
   249    This transform delivers Rows defined as NamedTuple registered in
   250    the coders registry, e.g.::
   251  
   252      ExampleRow = typing.NamedTuple('ExampleRow',
   253                                     [('id', int), ('name', unicode)])
   254      coders.registry.register_coder(ExampleRow, coders.RowCoder)
   255  
   256      with TestPipeline() as p:
   257        result = (
   258            p
   259            | 'Read from jdbc' >> ReadFromJdbc(
   260                table_name='jdbc_external_test_read'
   261                driver_class_name='org.postgresql.Driver',
   262                jdbc_url='jdbc:postgresql://localhost:5432/example',
   263                username='postgres',
   264                password='postgres',
   265            ))
   266  
   267    table_name is a required paramater, and by default, the read_query is
   268    generated from it.
   269  
   270    The generated read_query can be overridden by passing in a read_query.
   271  
   272    Experimental; no backwards compatibility guarantees.
   273    """
   274  
   275    URN = 'beam:transform:org.apache.beam:schemaio_jdbc_read:v1'
   276  
   277    def __init__(
   278        self,
   279        table_name,
   280        driver_class_name,
   281        jdbc_url,
   282        username,
   283        password,
   284        query=None,
   285        output_parallelization=None,
   286        fetch_size=None,
   287        partition_column=None,
   288        partitions=None,
   289        connection_properties=None,
   290        connection_init_sqls=None,
   291        max_connections=None,
   292        driver_jars=None,
   293        expansion_service=None,
   294        classpath=None,
   295    ):
   296      """
   297      Initializes a read operation from Jdbc.
   298  
   299      :param driver_class_name: name of the jdbc driver class
   300      :param jdbc_url: full jdbc url to the database.
   301      :param username: database username
   302      :param password: database password
   303      :param query: sql query to be executed
   304      :param output_parallelization: is output parallelization on
   305      :param fetch_size: how many rows to fetch
   306      :param partition_column: enable partitioned reads by splitting on this
   307                               column
   308      :param partitions: override the default number of splits when using
   309                         partition_column
   310      :param connection_properties: properties of the jdbc connection
   311                                    passed as string with format
   312                                    [propertyName=property;]*
   313      :param connection_init_sqls: required only for MySql and MariaDB.
   314                                   passed as list of strings
   315      :param max_connections: sets the maximum total number of connections.
   316                              use a negative value for no limit.
   317      :param driver_jars: comma separated paths for JDBC drivers. if not
   318                          specified, the default classloader is used to load the
   319                          driver jars.
   320      :param expansion_service: The address (host:port) of the ExpansionService.
   321      :param classpath: A list of JARs or Java packages to include in the
   322                        classpath for the expansion service. This option is
   323                        usually needed for `jdbc` to include extra JDBC driver
   324                        packages.
   325                        The packages can be in these three formats: (1) A local
   326                        file, (2) A URL, (3) A gradle-style identifier of a Maven
   327                        package (e.g. "org.postgresql:postgresql:42.3.1").
   328                        By default, this argument includes a Postgres SQL JDBC
   329                        driver.
   330      """
   331      classpath = classpath or DEFAULT_JDBC_CLASSPATH
   332      super().__init__(
   333          self.URN,
   334          NamedTupleBasedPayloadBuilder(
   335              JdbcConfigSchema(
   336                  location=table_name,
   337                  config=RowCoder(
   338                      typing_to_runner_api(Config).row_type.schema).encode(
   339                          Config(
   340                              driver_class_name=driver_class_name,
   341                              jdbc_url=jdbc_url,
   342                              username=username,
   343                              password=password,
   344                              connection_properties=connection_properties,
   345                              connection_init_sqls=connection_init_sqls,
   346                              write_statement=None,
   347                              read_query=query,
   348                              fetch_size=fetch_size,
   349                              output_parallelization=output_parallelization,
   350                              autosharding=None,
   351                              max_connections=max_connections,
   352                              driver_jars=driver_jars,
   353                              partition_column=partition_column,
   354                              partitions=partitions))),
   355          ),
   356          expansion_service or default_io_expansion_service(classpath),
   357      )