github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/jdbc.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""PTransforms for supporting Jdbc in Python pipelines.

These transforms are currently supported by the Beam portable
Flink, Spark, and Dataflow v2 runners.

**Setup**

Transforms provided in this module are cross-language transforms
implemented in the Beam Java SDK. During pipeline construction, the Python
SDK will connect to a Java expansion service to expand these transforms.
To facilitate this, a small amount of setup is needed before using these
transforms in a Beam Python pipeline.

There are several ways to set up cross-language Jdbc transforms.

* Option 1: use the default expansion service
* Option 2: specify a custom expansion service

See below for details regarding each of these options.

*Option 1: Use the default expansion service*

This is the recommended and easiest setup option for using Python Jdbc
transforms. This option is only available for Beam 2.24.0 and later.

This option requires the following prerequisites before running the Beam
pipeline.

* Install a Java runtime on the computer from which the pipeline is
  constructed and make sure that the 'java' command is available.

With this option, the Python SDK will either download (for released Beam
versions) or build (when running from a Beam Git clone) an expansion
service jar and use that to expand transforms. Currently, Jdbc transforms
use the 'beam-sdks-java-io-expansion-service' jar for this purpose.

The transforms in this file support an extra `classpath` argument, where one
can specify any extra JARs to be included in the classpath for the expansion
service. Users will need to specify this option to include the JDBC driver
for the database they are trying to use. **By default, a PostgreSQL JDBC
driver** is included (i.e. the Java package
`"org.postgresql:postgresql:42.2.16"`).

*Option 2: specify a custom expansion service*

With this option, you start up your own expansion service and provide that
as a parameter when using the transforms provided in this module.

This option requires the following prerequisites before running the Beam
pipeline.

* Start up your own expansion service.
* Update your pipeline to provide the expansion service address when
  initiating Jdbc transforms provided in this module.

Flink users can use the built-in expansion service of the Flink runner's
job server. If you start Flink's job server, the expansion service will be
started on port 8097. For a different address, please set the
expansion_service parameter.
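
As a minimal sketch of both options (the table, database, and credentials
below are hypothetical), a read that keeps the default expansion service
but adds a MySQL driver via `classpath` might look like::

  ReadFromJdbc(
      table_name='my_table',
      driver_class_name='com.mysql.cj.jdbc.Driver',
      jdbc_url='jdbc:mysql://localhost:3306/example',
      username='mysql',
      password='mysql',
      classpath=['mysql:mysql-connector-java:8.0.28'],
  )

To use a custom expansion service instead, pass its address, e.g.
`expansion_service='localhost:8097'`. Note that `classpath` only takes
effect when the default expansion service is used.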

**More information**

For more information regarding cross-language transforms see:
- https://beam.apache.org/roadmap/portability/

For more information specific to the Flink runner see:
- https://beam.apache.org/documentation/runners/flink/
"""

# pytype: skip-file

import typing

import numpy as np

from apache_beam.coders import RowCoder
from apache_beam.transforms.external import BeamJarExpansionService
from apache_beam.transforms.external import ExternalTransform
from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder
from apache_beam.typehints.schemas import typing_to_runner_api

__all__ = [
    'WriteToJdbc',
    'ReadFromJdbc',
]


def default_io_expansion_service(classpath=None):
  return BeamJarExpansionService(
      ':sdks:java:extensions:schemaio-expansion-service:shadowJar',
      classpath=classpath)


JdbcConfigSchema = typing.NamedTuple(
    'JdbcConfigSchema',
    [('location', str), ('config', bytes)],
)

Config = typing.NamedTuple(
    'Config',
    [('driver_class_name', str), ('jdbc_url', str), ('username', str),
     ('password', str), ('connection_properties', typing.Optional[str]),
     ('connection_init_sqls', typing.Optional[typing.List[str]]),
     ('read_query', typing.Optional[str]),
     ('write_statement', typing.Optional[str]),
     ('fetch_size', typing.Optional[np.int16]),
     ('output_parallelization', typing.Optional[bool]),
     ('autosharding', typing.Optional[bool]),
     ('partition_column', typing.Optional[str]),
     ('partitions', typing.Optional[np.int16]),
     ('max_connections', typing.Optional[np.int16]),
     ('driver_jars', typing.Optional[str])],
)

DEFAULT_JDBC_CLASSPATH = ['org.postgresql:postgresql:42.2.16']


class WriteToJdbc(ExternalTransform):
  """A PTransform which writes Rows to the specified database via JDBC.

  This transform receives Rows defined as a NamedTuple type and registered in
  the coders registry, e.g.::

    ExampleRow = typing.NamedTuple('ExampleRow',
                                   [('id', int), ('name', str)])
    coders.registry.register_coder(ExampleRow, coders.RowCoder)

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create([ExampleRow(1, 'abc')])
            .with_output_types(ExampleRow)
          | 'Write to jdbc' >> WriteToJdbc(
              table_name='jdbc_external_test_write',
              driver_class_name='org.postgresql.Driver',
              jdbc_url='jdbc:postgresql://localhost:5432/example',
              username='postgres',
              password='postgres',
          ))

  table_name is a required parameter, and by default, the write_statement is
  generated from it.

  The generated write_statement can be overridden by passing in a custom
  write statement via the `statement` parameter.
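
  For example (the INSERT statement below is a hypothetical one that must
  match the target table's column layout)::

    WriteToJdbc(
        table_name='jdbc_external_test_write',
        statement='INSERT INTO jdbc_external_test_write VALUES(?, ?)',
        driver_class_name='org.postgresql.Driver',
        jdbc_url='jdbc:postgresql://localhost:5432/example',
        username='postgres',
        password='postgres',
    )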

  Experimental; no backwards compatibility guarantees.
  """

  URN = 'beam:transform:org.apache.beam:schemaio_jdbc_write:v1'

  def __init__(
      self,
      table_name,
      driver_class_name,
      jdbc_url,
      username,
      password,
      statement=None,
      connection_properties=None,
      connection_init_sqls=None,
      autosharding=False,
      max_connections=None,
      driver_jars=None,
      expansion_service=None,
      classpath=None,
  ):
    """
    Initializes a write operation to Jdbc.

    :param table_name: name of the table to write to
    :param driver_class_name: name of the jdbc driver class
    :param jdbc_url: full jdbc url to the database
    :param username: database username
    :param password: database password
    :param statement: sql statement to be executed
    :param connection_properties: properties of the jdbc connection
                                  passed as a string with format
                                  [propertyName=property;]*
    :param connection_init_sqls: required only for MySQL and MariaDB.
                                 passed as a list of strings
    :param autosharding: enable automatic re-sharding of bundles to scale the
                         number of shards with the number of workers
    :param max_connections: sets the maximum total number of connections.
                            use a negative value for no limit
    :param driver_jars: comma-separated paths for JDBC drivers. if not
                        specified, the default classloader is used to load
                        the driver jars
    :param expansion_service: the address (host:port) of the expansion service
    :param classpath: a list of JARs or Java packages to include in the
                      classpath for the expansion service. This option is
                      usually needed for `jdbc` to include extra JDBC driver
                      packages. The packages can be in these three formats:
                      (1) a local file, (2) a URL, (3) a Gradle-style
                      identifier of a Maven package (e.g.
                      "org.postgresql:postgresql:42.3.1"). By default, this
                      argument includes a PostgreSQL JDBC driver.
    """
    classpath = classpath or DEFAULT_JDBC_CLASSPATH
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            JdbcConfigSchema(
                location=table_name,
                config=RowCoder(
                    typing_to_runner_api(Config).row_type.schema).encode(
                        Config(
                            driver_class_name=driver_class_name,
                            jdbc_url=jdbc_url,
                            username=username,
                            password=password,
                            connection_properties=connection_properties,
                            connection_init_sqls=connection_init_sqls,
                            write_statement=statement,
                            read_query=None,
                            fetch_size=None,
                            output_parallelization=None,
                            autosharding=autosharding,
                            max_connections=max_connections,
                            driver_jars=driver_jars,
                            partitions=None,
                            partition_column=None))),
        ),
        expansion_service or default_io_expansion_service(classpath),
    )


class ReadFromJdbc(ExternalTransform):
  """A PTransform which reads Rows from the specified database via JDBC.

  This transform delivers Rows defined as a NamedTuple registered in
  the coders registry, e.g.::

    ExampleRow = typing.NamedTuple('ExampleRow',
                                   [('id', int), ('name', str)])
    coders.registry.register_coder(ExampleRow, coders.RowCoder)

    with TestPipeline() as p:
      result = (
          p
          | 'Read from jdbc' >> ReadFromJdbc(
              table_name='jdbc_external_test_read',
              driver_class_name='org.postgresql.Driver',
              jdbc_url='jdbc:postgresql://localhost:5432/example',
              username='postgres',
              password='postgres',
          ))

  table_name is a required parameter, and by default, the read_query is
  generated from it.

  The generated read_query can be overridden by passing in a custom query
  via the `query` parameter.
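
  For example (the query below is hypothetical)::

    ReadFromJdbc(
        table_name='jdbc_external_test_read',
        query='SELECT id, name FROM jdbc_external_test_read WHERE id > 100',
        driver_class_name='org.postgresql.Driver',
        jdbc_url='jdbc:postgresql://localhost:5432/example',
        username='postgres',
        password='postgres',
    )

  Partitioned reads can be requested with the partition_column and
  partitions parameters described below.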

  Experimental; no backwards compatibility guarantees.
  """

  URN = 'beam:transform:org.apache.beam:schemaio_jdbc_read:v1'

  def __init__(
      self,
      table_name,
      driver_class_name,
      jdbc_url,
      username,
      password,
      query=None,
      output_parallelization=None,
      fetch_size=None,
      partition_column=None,
      partitions=None,
      connection_properties=None,
      connection_init_sqls=None,
      max_connections=None,
      driver_jars=None,
      expansion_service=None,
      classpath=None,
  ):
    """
    Initializes a read operation from Jdbc.

    :param table_name: name of the table to read from
    :param driver_class_name: name of the jdbc driver class
    :param jdbc_url: full jdbc url to the database
    :param username: database username
    :param password: database password
    :param query: sql query to be executed
    :param output_parallelization: is output parallelization on
    :param fetch_size: how many rows to fetch
    :param partition_column: enable partitioned reads by splitting on this
                             column
    :param partitions: override the default number of splits when using
                       partition_column
    :param connection_properties: properties of the jdbc connection
                                  passed as a string with format
                                  [propertyName=property;]*
    :param connection_init_sqls: required only for MySQL and MariaDB.
                                 passed as a list of strings
    :param max_connections: sets the maximum total number of connections.
                            use a negative value for no limit
    :param driver_jars: comma-separated paths for JDBC drivers. if not
                        specified, the default classloader is used to load
                        the driver jars
    :param expansion_service: the address (host:port) of the expansion service
    :param classpath: a list of JARs or Java packages to include in the
                      classpath for the expansion service. This option is
                      usually needed for `jdbc` to include extra JDBC driver
                      packages. The packages can be in these three formats:
                      (1) a local file, (2) a URL, (3) a Gradle-style
                      identifier of a Maven package (e.g.
                      "org.postgresql:postgresql:42.3.1"). By default, this
                      argument includes a PostgreSQL JDBC driver.
    """
    classpath = classpath or DEFAULT_JDBC_CLASSPATH
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            JdbcConfigSchema(
                location=table_name,
                config=RowCoder(
                    typing_to_runner_api(Config).row_type.schema).encode(
                        Config(
                            driver_class_name=driver_class_name,
                            jdbc_url=jdbc_url,
                            username=username,
                            password=password,
                            connection_properties=connection_properties,
                            connection_init_sqls=connection_init_sqls,
                            write_statement=None,
                            read_query=query,
                            fetch_size=fetch_size,
                            output_parallelization=output_parallelization,
                            autosharding=None,
                            max_connections=max_connections,
                            driver_jars=driver_jars,
                            partition_column=partition_column,
                            partitions=partitions))),
        ),
        expansion_service or default_io_expansion_service(classpath),
    )
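
# Usage sketch for the MySQL/MariaDB-specific connection options documented
# above (all values are hypothetical; connection_init_sqls is only honored
# for MySQL and MariaDB, and the driver is supplied via classpath):
#
#   _ = (
#       p
#       | beam.Create([ExampleRow(1, 'abc')]).with_output_types(ExampleRow)
#       | 'Write to jdbc' >> WriteToJdbc(
#           table_name='my_table',
#           driver_class_name='com.mysql.cj.jdbc.Driver',
#           jdbc_url='jdbc:mysql://localhost:3306/example',
#           username='mysql',
#           password='mysql',
#           connection_properties='useSSL=false;serverTimezone=UTC;',
#           connection_init_sqls=["SET sql_mode = 'ANSI_QUOTES'"],
#           autosharding=True,
#           classpath=['mysql:mysql-connector-java:8.0.28'],
#       ))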