Source code for airflow.contrib.hooks.gcp_transfer_hook

# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This module contains a Google Storage Transfer Service Hook.
"""

import json
import time
import warnings
from copy import deepcopy
from datetime import timedelta
from typing import Dict, List, Tuple, Union, Optional

from googleapiclient.discovery import build

from airflow.exceptions import AirflowException
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook

# Time to sleep between active checks of the operation results
TIME_TO_SLEEP_IN_SECONDS = 10


class GcpTransferJobsStatus:
    """
    Class with GCP Transfer jobs statuses.
    """

    ENABLED = "ENABLED"
    DISABLED = "DISABLED"
    DELETED = "DELETED"


class GcpTransferOperationStatus:
    """
    Class with GCP Transfer operations statuses.
    """

    IN_PROGRESS = "IN_PROGRESS"
    PAUSED = "PAUSED"
    SUCCESS = "SUCCESS"
    FAILED = "FAILED"
    ABORTED = "ABORTED"

# A list of keywords used to build a request or response
ACCESS_KEY_ID = "accessKeyId"
ALREADY_EXISTING_IN_SINK = "overwriteObjectsAlreadyExistingInSink"
AWS_ACCESS_KEY = "awsAccessKey"
AWS_S3_DATA_SOURCE = 'awsS3DataSource'
BODY = 'body'
BUCKET_NAME = 'bucketName'
DAY = 'day'
DESCRIPTION = "description"
FILTER = 'filter'
FILTER_JOB_NAMES = 'job_names'
FILTER_PROJECT_ID = 'project_id'
GCS_DATA_SINK = 'gcsDataSink'
GCS_DATA_SOURCE = 'gcsDataSource'
HOURS = "hours"
HTTP_DATA_SOURCE = 'httpDataSource'
LIST_URL = 'list_url'
METADATA = 'metadata'
MINUTES = "minutes"
MONTH = 'month'
NAME = 'name'
OBJECT_CONDITIONS = 'object_conditions'
OPERATIONS = 'operations'
PROJECT_ID = 'projectId'
SCHEDULE = 'schedule'
SCHEDULE_END_DATE = 'scheduleEndDate'
SCHEDULE_START_DATE = 'scheduleStartDate'
SECONDS = "seconds"
SECRET_ACCESS_KEY = "secretAccessKey"
START_TIME_OF_DAY = 'startTimeOfDay'
STATUS = 'status'
TRANSFER_JOB = 'transfer_job'
TRANSFER_JOB_FIELD_MASK = 'update_transfer_job_field_mask'
TRANSFER_JOBS = 'transferJobs'
TRANSFER_OPERATIONS = 'transferOperations'
TRANSFER_OPTIONS = 'transfer_options'
TRANSFER_SPEC = 'transferSpec'
YEAR = 'year'

NEGATIVE_STATUSES = {GcpTransferOperationStatus.FAILED, GcpTransferOperationStatus.ABORTED}
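
# A minimal, illustrative request body showing how the keyword constants above
# map onto the transferJobs REST resource. The description, dates, and bucket
# names are hypothetical, not values used by this module; projectId is omitted
# because create_transfer_job injects it from the connection when absent.
_EXAMPLE_TRANSFER_JOB_BODY = {
    DESCRIPTION: 'example: nightly S3 -> GCS sync',  # hypothetical
    STATUS: GcpTransferJobsStatus.ENABLED,
    SCHEDULE: {
        SCHEDULE_START_DATE: {DAY: 1, MONTH: 1, YEAR: 2020},  # hypothetical
        START_TIME_OF_DAY: {HOURS: 2, MINUTES: 0, SECONDS: 0},
    },
    TRANSFER_SPEC: {
        AWS_S3_DATA_SOURCE: {BUCKET_NAME: 'example-source-bucket'},  # hypothetical
        GCS_DATA_SINK: {BUCKET_NAME: 'example-sink-bucket'},  # hypothetical
    },
}
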
# noinspection PyAbstractClass
class GCPTransferServiceHook(GoogleCloudBaseHook):
    """
    Hook for Google Storage Transfer Service.

    All the methods in the hook where project_id is used must be called with
    keyword arguments rather than positional.
    """

    def __init__(
        self,
        api_version: str = 'v1',
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None
    ) -> None:
        super().__init__(gcp_conn_id, delegate_to)
        self.api_version = api_version
        self.num_retries = self._get_field('num_retries', 5)  # type: int
        self._conn = None

    def get_conn(self):
        """
        Retrieves connection to Google Storage Transfer service.

        :return: Google Storage Transfer service object
        :rtype: dict
        """
        if not self._conn:
            http_authorized = self._authorize()
            self._conn = build(
                'storagetransfer', self.api_version, http=http_authorized, cache_discovery=False
            )
        return self._conn

    @GoogleCloudBaseHook.catch_http_exception
    def create_transfer_job(self, body: Dict) -> Dict:
        """
        Creates a transfer job that runs periodically.

        :param body: (Required) A request body, as described in
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/create#request-body
        :type body: dict
        :return: transfer job.
            See:
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs#TransferJob
        :rtype: dict
        """
        body = self._inject_project_id(body, BODY, PROJECT_ID)
        return self.get_conn().transferJobs().create(body=body).execute(  # pylint: disable=no-member
            num_retries=self.num_retries)
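
    # Illustrative usage, reusing the hypothetical _EXAMPLE_TRANSFER_JOB_BODY
    # defined at module level:
    #     hook = GCPTransferServiceHook()
    #     job = hook.create_transfer_job(body=_EXAMPLE_TRANSFER_JOB_BODY)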

    @GoogleCloudBaseHook.fallback_to_default_project_id
    @GoogleCloudBaseHook.catch_http_exception
    def get_transfer_job(self, job_name: str, project_id: Optional[str] = None) -> Dict:
        """
        Gets the latest state of a transfer job in Google Storage Transfer
        Service.

        :param job_name: (Required) Name of the job to be fetched
        :type job_name: str
        :param project_id: (Optional) the ID of the project that owns the Transfer
            Job. If set to None or missing, the default project_id from the GCP
            connection is used.
        :type project_id: str
        :return: Transfer Job
        :rtype: dict
        """
        assert project_id is not None
        return (
            self.get_conn()  # pylint: disable=no-member
            .transferJobs()
            .get(jobName=job_name, projectId=project_id)
            .execute(num_retries=self.num_retries)
        )

    def list_transfer_job(self, request_filter: Optional[Dict] = None, **kwargs) -> List[Dict]:
        """
        Lists transfer jobs in Google Storage Transfer Service that match the
        specified filter.

        :param request_filter: (Required) A request filter, as described in
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/list#body.QUERY_PARAMETERS.filter
        :type request_filter: dict
        :return: List of Transfer Jobs
        :rtype: list[dict]
        """
        # To preserve backward compatibility
        # TODO: remove one day
        if request_filter is None:
            if 'filter' in kwargs:
                request_filter = kwargs['filter']
                assert isinstance(request_filter, Dict)
                warnings.warn("Use 'request_filter' instead of 'filter'", DeprecationWarning)
            else:
                raise TypeError(
                    "list_transfer_job missing 1 required positional argument: 'request_filter'"
                )

        conn = self.get_conn()
        request_filter = self._inject_project_id(request_filter, FILTER, FILTER_PROJECT_ID)
        request = conn.transferJobs().list(filter=json.dumps(request_filter))  # pylint: disable=no-member
        jobs = []  # type: List[Dict]

        while request is not None:
            response = request.execute(num_retries=self.num_retries)
            jobs.extend(response[TRANSFER_JOBS])

            request = conn.transferJobs().list_next(previous_request=request,  # pylint: disable=no-member
                                                    previous_response=response)

        return jobs
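
    # Illustrative filter, with hypothetical project and job names:
    #     hook.list_transfer_job(request_filter={
    #         FILTER_PROJECT_ID: 'example-project',
    #         FILTER_JOB_NAMES: ['transferJobs/example'],
    #     })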

    @GoogleCloudBaseHook.catch_http_exception
    def update_transfer_job(self, job_name: str, body: Dict) -> Dict:
        """
        Updates a transfer job that runs periodically.

        :param job_name: (Required) Name of the job to be updated
        :type job_name: str
        :param body: A request body, as described in
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/patch#request-body
        :type body: dict
        :return: If successful, TransferJob.
        :rtype: dict
        """
        body = self._inject_project_id(body, BODY, PROJECT_ID)
        return (
            self.get_conn()  # pylint: disable=no-member
            .transferJobs()
            .patch(jobName=job_name, body=body)
            .execute(num_retries=self.num_retries)
        )

    @GoogleCloudBaseHook.fallback_to_default_project_id
    @GoogleCloudBaseHook.catch_http_exception
    def delete_transfer_job(self, job_name: str, project_id: Optional[str] = None) -> None:
        """
        Deletes a transfer job. This is a soft delete. After a transfer job is
        deleted, the job and all the transfer executions are subject to garbage
        collection. Transfer jobs become eligible for garbage collection
        30 days after soft delete.

        :param job_name: (Required) Name of the job to be deleted
        :type job_name: str
        :param project_id: (Optional) the ID of the project that owns the Transfer
            Job. If set to None or missing, the default project_id from the GCP
            connection is used.
        :type project_id: str
        :rtype: None
        """
        assert project_id is not None
        (
            self.get_conn()  # pylint: disable=no-member
            .transferJobs()
            .patch(
                jobName=job_name,
                body={
                    PROJECT_ID: project_id,
                    TRANSFER_JOB: {STATUS: GcpTransferJobsStatus.DELETED},
                    TRANSFER_JOB_FIELD_MASK: STATUS,
                },
            )
            .execute(num_retries=self.num_retries)
        )

    @GoogleCloudBaseHook.catch_http_exception
    def cancel_transfer_operation(self, operation_name: str) -> None:
        """
        Cancels a transfer operation in Google Storage Transfer Service.

        :param operation_name: Name of the transfer operation.
        :type operation_name: str
        :rtype: None
        """
        self.get_conn().transferOperations().cancel(  # pylint: disable=no-member
            name=operation_name).execute(num_retries=self.num_retries)

    @GoogleCloudBaseHook.catch_http_exception
    def get_transfer_operation(self, operation_name: str) -> Dict:
        """
        Gets a transfer operation in Google Storage Transfer Service.

        :param operation_name: (Required) Name of the transfer operation.
        :type operation_name: str
        :return: transfer operation
            See:
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/Operation
        :rtype: dict
        """
        return (
            self.get_conn()  # pylint: disable=no-member
            .transferOperations()
            .get(name=operation_name)
            .execute(num_retries=self.num_retries)
        )

    @GoogleCloudBaseHook.catch_http_exception
    def list_transfer_operations(self, request_filter: Optional[Dict] = None, **kwargs) -> List[Dict]:
        """
        Lists transfer operations in Google Storage Transfer Service.

        :param request_filter: (Required) A request filter, as described in
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/list#body.QUERY_PARAMETERS.filter
            With one additional improvement:

            * project_id is optional if you have a project id defined
              in the connection
              See: :ref:`howto/connection:gcp`

        :type request_filter: dict
        :return: transfer operations
        :rtype: list[dict]
        """
        # To preserve backward compatibility
        # TODO: remove one day
        if request_filter is None:
            if 'filter' in kwargs:
                request_filter = kwargs['filter']
                assert isinstance(request_filter, Dict)
                warnings.warn("Use 'request_filter' instead of 'filter'", DeprecationWarning)
            else:
                raise TypeError(
                    "list_transfer_operations missing 1 required positional argument: 'request_filter'"
                )

        conn = self.get_conn()

        request_filter = self._inject_project_id(request_filter, FILTER, FILTER_PROJECT_ID)

        operations = []  # type: List[Dict]

        request = conn.transferOperations().list(  # pylint: disable=no-member
            name=TRANSFER_OPERATIONS, filter=json.dumps(request_filter))

        while request is not None:
            response = request.execute(num_retries=self.num_retries)
            if OPERATIONS in response:
                operations.extend(response[OPERATIONS])

            request = conn.transferOperations().list_next(  # pylint: disable=no-member
                previous_request=request, previous_response=response
            )

        return operations
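
    # Illustrative usage, with a hypothetical job name; project_id falls back
    # to the connection's project when omitted from the filter:
    #     operations = hook.list_transfer_operations(
    #         request_filter={FILTER_JOB_NAMES: ['transferJobs/example']})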

    @GoogleCloudBaseHook.catch_http_exception
    def pause_transfer_operation(self, operation_name: str) -> None:
        """
        Pauses a transfer operation in Google Storage Transfer Service.

        :param operation_name: (Required) Name of the transfer operation.
        :type operation_name: str
        :rtype: None
        """
        self.get_conn().transferOperations().pause(  # pylint: disable=no-member
            name=operation_name).execute(num_retries=self.num_retries)

    @GoogleCloudBaseHook.catch_http_exception
    def resume_transfer_operation(self, operation_name: str) -> None:
        """
        Resumes a transfer operation in Google Storage Transfer Service.

        :param operation_name: (Required) Name of the transfer operation.
        :type operation_name: str
        :rtype: None
        """
        self.get_conn().transferOperations().resume(  # pylint: disable=no-member
            name=operation_name).execute(num_retries=self.num_retries)

    @GoogleCloudBaseHook.catch_http_exception
    def wait_for_transfer_job(
        self,
        job: Dict,
        expected_statuses: Tuple[str, ...] = (GcpTransferOperationStatus.SUCCESS,),
        timeout: Optional[Union[float, timedelta]] = None
    ) -> None:
        """
        Waits until the job reaches the expected state.

        :param job: Transfer job
            See:
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs#TransferJob
        :type job: dict
        :param expected_statuses: State that is expected
            See:
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferOperations#Status
        :type expected_statuses: set[str]
        :param timeout: Time in which the operation must end, in seconds. If not
            specified, defaults to 60 seconds.
        :type timeout: Optional[Union[float, timedelta]]
        :rtype: None
        """
        if timeout is None:
            timeout = 60
        elif isinstance(timeout, timedelta):
            timeout = timeout.total_seconds()

        start_time = time.time()
        while time.time() - start_time < timeout:
            operations = self.list_transfer_operations(
                request_filter={FILTER_PROJECT_ID: job[PROJECT_ID], FILTER_JOB_NAMES: [job[NAME]]}
            )

            if GCPTransferServiceHook.operations_contain_expected_statuses(operations, expected_statuses):
                return
            time.sleep(TIME_TO_SLEEP_IN_SECONDS)
        raise AirflowException("Timeout. The operation could not be completed within the allotted time.")
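
    # Illustrative usage: a timedelta timeout is converted to seconds
    # internally, and the job dict is one returned by create_transfer_job.
    #     hook.wait_for_transfer_job(job, timeout=timedelta(minutes=5))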

    def _inject_project_id(self, body: Dict, param_name: str, target_key: str) -> Dict:
        body = deepcopy(body)
        body[target_key] = body.get(target_key, self.project_id)
        if not body.get(target_key):
            raise AirflowException(
                "The project id must be passed either as `{}` key in `{}` parameter or as project_id "
                "extra in GCP connection definition. Both are not set!".format(target_key, param_name)
            )
        return body

    @staticmethod
    def operations_contain_expected_statuses(
        operations: List[Dict], expected_statuses: Union[Tuple[str, ...], str]
    ) -> bool:
        """
        Checks whether the operation list has an operation with one of the
        expected statuses.

        :param operations: (Required) List of transfer operations to check.
        :type operations: list[dict]
        :param expected_statuses: (Required) Status that is expected
            See:
            https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferOperations#Status
        :type expected_statuses: set[str]
        :return: True if there is an operation with an expected status in the
            operation list, False otherwise.
        :raises AirflowException: If it encounters operations in FAILED or
            ABORTED state that were not expected.
        :rtype: bool
        """
        expected_statuses_set = (
            {expected_statuses} if isinstance(expected_statuses, str) else set(expected_statuses)
        )
        if not operations:
            return False

        current_statuses = {operation[METADATA][STATUS] for operation in operations}

        if len(current_statuses - expected_statuses_set) != len(current_statuses):
            return True

        if len(NEGATIVE_STATUSES - current_statuses) != len(NEGATIVE_STATUSES):
            raise AirflowException(
                'An unexpected operation status was encountered. Expected: {}'.format(
                    ", ".join(expected_statuses_set)
                )
            )
        return False
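

# A short end-to-end sketch, illustrative only: it assumes an Airflow
# environment with a configured 'google_cloud_default' connection whose
# project has the Storage Transfer API enabled, and it reuses the
# hypothetical _EXAMPLE_TRANSFER_JOB_BODY defined above.
if __name__ == '__main__':
    hook = GCPTransferServiceHook()
    # create_transfer_job injects the connection's project id into the body.
    job = hook.create_transfer_job(body=_EXAMPLE_TRANSFER_JOB_BODY)
    # Poll every TIME_TO_SLEEP_IN_SECONDS until an operation for this job
    # reaches SUCCESS, or raise AirflowException after five minutes.
    hook.wait_for_transfer_job(job, timeout=timedelta(minutes=5))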