go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cipd/appengine/verification/service-verification.yaml (about)

     1  service: verification  # Service name; this manifest configures the verification backend.
     2  runtime: go121  # Runtime identifier (presumably Go 1.21 -- confirm).
     3  
     4  luci_gae_vars:  # Values for the ${NAME} placeholders used in this file (presumably filled in by the deployment tooling -- confirm).
     5    chrome-infra-packages-dev:  # Variable set for this project; it is the only one listed here.
     6      VERIFICATION_INSTANCE_CLASS: B2  # Referenced by instance_class.
     7      GCS_BUCKET: chrome-infra-packages-dev.appspot.com  # Used in -cipd-storage-gs-path and -cipd-temp-gs-path.
     8      AUTH_SERVICE_HOST: chrome-infra-auth-dev.appspot.com  # Passed as -auth-service-host.
     9      CONFIG_SERVICE_HOST: config.luci.app  # Passed as -config-service-host.
    10      TS_MON_ACCOUNT: app-engine-metric-publishers@prodx-mon-chrome-infra.google.com.iam.gserviceaccount.com  # Passed as -ts-mon-account.
    11      VPC_CONNECTOR: projects/chrome-infra-packages-dev/locations/us-central1/connectors/connector  # Referenced by vpc_access_connector.name.
    12      REDIS_ADDR: 10.248.220.35:6379  # Passed as -redis-addr (host:port).
    13      DS_CACHE: redis  # Passed as -ds-cache.
    14  
    15  instance_class: ${VERIFICATION_INSTANCE_CLASS}  # Taken from luci_gae_vars (B2 for chrome-infra-packages-dev).
    16  
    17  # NOTE: Ideally this should be auto-scaled, but auto-scaled instances still
    18  # have a maximum request duration of 10 minutes, which is insufficient for
    19  # verification.
    20  #
    21  # So, we switched this to basic scaling, but AppEngine hard-codes a "10
    22  # concurrent requests" scale-up signal for basic scaling, which means that
    23  # verification backends will be trying to handle 10 very beefy verification
    24  # requests and timing out, but AppEngine will keep them all in one instance.
    25  #
    26  # Finally, we have arrived at the bottom of the barrel here - manual scaling.
    27  # Inconveniently, manually scaled instances are for EVERY uploaded version. This
    28  # means that our deployment tooling has to be set to keep a maximum of 1 extra
    29  # version around, or we will end up with hundreds of verification backend
    30  # instances sitting completely idle.
    31  #
    32  # Probably the right long term fix here would be to move the upload process to
    33  # go through e.g. CloudRun where the scaling characteristics are uncoupled from
    34  # maximum request duration. This would allow us to calculate the sha256 (or any
    35  # future hashes) on the fly during the upload process, which would also simplify
    36  # the `cipd` client code, since upload returning `200 OK` could be followed
    37  # immediately by other steps in the package registration process, rather than
    38  # waiting for the verification backend to do its work asynchronously.
    39  #
    40  # Really ideally GCS could implement e.g. sha256 object hashing and then we
    41  # could completely delete the verification backend :/
    42  manual_scaling:  # Manual scaling is deliberate; the NOTE above explains why auto and basic scaling were ruled out.
    43    instances: 10  # Fixed instance count; per the NOTE above, this applies to EVERY uploaded version.
    44  
    45  vpc_access_connector:  # Presumably needed to reach the private REDIS_ADDR -- confirm.
    46    name: ${VPC_CONNECTOR}  # Connector resource path, taken from luci_gae_vars.
    47  
    48  inbound_services:  # Inbound services enabled for this service.
    49  - warmup  # Only `warmup` is listed.
    50  
    51  entrypoint: >  # Folded scalar: the lines below join into one command line; ${NAME} values come from luci_gae_vars. Do not put '#' comments inside it.
    52    main
    53    -auth-service-host ${AUTH_SERVICE_HOST}
    54    -config-service-host ${CONFIG_SERVICE_HOST}
    55    -ts-mon-account ${TS_MON_ACCOUNT}
    56    -redis-addr ${REDIS_ADDR}
    57    -ds-cache ${DS_CACHE}
    58    -dsmapper-mapper-queue mappers
    59    -dsmapper-control-queue default
    60    -internal-request-timeout 60m
    61    -cipd-storage-gs-path /${GCS_BUCKET}/store
    62    -cipd-temp-gs-path /${GCS_BUCKET}/temp
    63    -random-secrets-in-datastore
    64    -cloud-error-reporting
    65    -bqlog-dataset cipd