google_dataproc_cluster

Manages a Cloud Dataproc cluster resource within GCP.

Example Usage - Basic

resource "google_dataproc_cluster" "simplecluster" {
  name   = "simplecluster"
  region = "us-central1"
}

Example Usage - Advanced

resource "google_service_account" "default" {
  account_id   = "service-account-id"
  display_name = "Service Account"
}

resource "google_dataproc_cluster" "mycluster" {
  name     = "mycluster"
  region   = "us-central1"
  graceful_decommission_timeout = "120s"
  labels = {
    foo = "bar"
  }

  cluster_config {
    staging_bucket = "dataproc-staging-bucket"

    master_config {
      num_instances = 1
      machine_type  = "e2-medium"
      disk_config {
        boot_disk_type    = "pd-ssd"
        boot_disk_size_gb = 30
      }
    }

    worker_config {
      num_instances    = 2
      machine_type     = "e2-medium"
      min_cpu_platform = "Intel Skylake"
      disk_config {
        boot_disk_size_gb = 30
        num_local_ssds    = 1
      }
    }

    preemptible_worker_config {
      num_instances = 0
    }

    # Override or set some custom properties
    software_config {
      image_version = "2.0.35-debian10"
      override_properties = {
        "dataproc:dataproc.allow.zero.workers" = "true"
      }
    }

    gce_cluster_config {
      tags = ["foo", "bar"]
      # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
      service_account = google_service_account.default.email
      service_account_scopes = [
        "cloud-platform"
      ]
    }

    # You can define multiple initialization_action blocks
    initialization_action {
      script      = "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh"
      timeout_sec = 500
    }
  }
}

Example Usage - Using a GPU accelerator

resource "google_dataproc_cluster" "accelerated_cluster" {
  name   = "my-cluster-with-gpu"
  region = "us-central1"

  cluster_config {
    gce_cluster_config {
      zone = "us-central1-a"
    }

    master_config {
      accelerators {
        accelerator_type  = "nvidia-tesla-k80"
        accelerator_count = 1
      }
    }
  }
}

Argument Reference



The virtual_cluster_config block supports:

virtual_cluster_config {
  auxiliary_services_config { ... }
  kubernetes_cluster_config { ... }
}


The auxiliary_services_config block supports:

virtual_cluster_config {
  auxiliary_services_config {
    metastore_config {
      dataproc_metastore_service = google_dataproc_metastore_service.metastore_service.id
    }

    spark_history_server_config {
      dataproc_cluster = google_dataproc_cluster.dataproc_cluster.id
    }
  }
}


The kubernetes_cluster_config block supports:

virtual_cluster_config {
  kubernetes_cluster_config {
    kubernetes_namespace = "foobar"

    kubernetes_software_config {
      component_version = {
        "SPARK" = "3.1-dataproc-7"
      }

      properties = {
        "spark:spark.eventLog.enabled" = "true"
      }
    }

    gke_cluster_config {
      gke_cluster_target = google_container_cluster.primary.id

      node_pool_target {
        node_pool = "dpgke"
        roles     = ["DEFAULT"]

        node_pool_config {
          autoscaling {
            min_node_count = 1
            max_node_count = 6
          }

          config {
            machine_type     = "n1-standard-4"
            preemptible      = true
            local_ssd_count  = 1
            min_cpu_platform = "Intel Sandy Bridge"
          }

          locations = ["us-central1-c"]
        }
      }
    }
  }
}


The cluster_config block supports:

cluster_config {
  gce_cluster_config        { ... }
  master_config             { ... }
  worker_config             { ... }
  preemptible_worker_config { ... }
  software_config           { ... }

  # You can define multiple initialization_action blocks
  initialization_action     { ... }
  encryption_config         { ... }
  endpoint_config           { ... }
  metastore_config          { ... }
}


The cluster_config.gce_cluster_config block supports:

cluster_config {
  gce_cluster_config {
    zone = "us-central1-a"

    # One of the below to hook into a custom network / subnetwork
    network    = google_compute_network.dataproc_network.name
    subnetwork = google_compute_network.dataproc_subnetwork.name

    tags = ["foo", "bar"]
  }
}


The cluster_config.gce_cluster_config.shielded_instance_config block supports:

cluster_config {
  gce_cluster_config {
    shielded_instance_config {
      enable_secure_boot          = true
      enable_vtpm                 = true
      enable_integrity_monitoring = true
    }
  }
}

The cluster_config.master_config block supports:

cluster_config {
  master_config {
    num_instances    = 1
    machine_type     = "e2-medium"
    min_cpu_platform = "Intel Skylake"

    disk_config {
      boot_disk_type    = "pd-ssd"
      boot_disk_size_gb = 30
      num_local_ssds    = 1
    }
  }
}

The cluster_config.worker_config block supports:

cluster_config {
  worker_config {
    num_instances    = 3
    machine_type     = "e2-medium"
    min_cpu_platform = "Intel Skylake"
    min_num_instances = 2
    disk_config {
      boot_disk_type    = "pd-standard"
      boot_disk_size_gb = 30
      num_local_ssds    = 1
    }
  }
}

The cluster_config.preemptible_worker_config block supports:

cluster_config {
  preemptible_worker_config {
    num_instances = 1

    disk_config {
      boot_disk_type    = "pd-standard"
      boot_disk_size_gb = 30
      num_local_ssds    = 1
    }
    instance_flexibility_policy {
      instance_selection_list {
        machine_types = ["n2-standard-2","n1-standard-2"]
        rank          = 1
      }
      instance_selection_list {
        machine_types = ["n2d-standard-2"]
        rank          = 3
      }
    }
  }
}

Note: Unlike worker_config, you cannot set machine_type here. Preemptible workers always inherit the machine type configured in worker_config.machine_type.
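
For example, in the sketch below (names and values are illustrative) the preemptible workers are created with the e2-medium machine type inherited from worker_config:

cluster_config {
  worker_config {
    num_instances = 2
    machine_type  = "e2-medium"
  }

  # machine_type cannot be set here; these workers
  # inherit e2-medium from worker_config above.
  preemptible_worker_config {
    num_instances = 1
  }
}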


The cluster_config.software_config block supports:

cluster_config {
  # Override or set some custom properties
  software_config {
    image_version = "2.0.35-debian10"

    override_properties = {
      "dataproc:dataproc.allow.zero.workers" = "true"
    }
  }
}

The cluster_config.security_config block supports:

cluster_config {
  security_config {
    kerberos_config {
      kms_key_uri = "projects/projectId/locations/locationId/keyRings/keyRingId/cryptoKeys/keyId"
      root_principal_password_uri = "bucketId/o/objectId"
    }
  }
}

The cluster_config.autoscaling_config block supports:

cluster_config {
  autoscaling_config {
    policy_uri = "projects/projectId/locations/region/autoscalingPolicies/policyId"
  }
}

Only resource names that include the project ID and location (region) are valid. Examples:

https://www.googleapis.com/compute/v1/projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]
projects/[projectId]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]

Note that the policy must be in the same project and Cloud Dataproc region as the cluster.
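
For example, a minimal sketch that creates a policy with google_dataproc_autoscaling_policy and attaches it via its exported name (the resource names and scaling factors below are illustrative):

resource "google_dataproc_autoscaling_policy" "asp" {
  policy_id = "dataproc-policy"
  location  = "us-central1"

  worker_config {
    max_instances = 3
  }

  basic_algorithm {
    yarn_config {
      graceful_decommission_timeout = "30s"
      scale_up_factor               = 0.5
      scale_down_factor             = 0.5
    }
  }
}

resource "google_dataproc_cluster" "mycluster" {
  name   = "mycluster"
  region = "us-central1"

  cluster_config {
    autoscaling_config {
      # name resolves to projects/[projectId]/locations/us-central1/autoscalingPolicies/dataproc-policy
      policy_uri = google_dataproc_autoscaling_policy.asp.name
    }
  }
}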


The initialization_action block (Optional) can be specified multiple times and supports:

cluster_config {
  # You can define multiple initialization_action blocks
  initialization_action {
    script      = "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh"
    timeout_sec = 500
  }
}

The encryption_config block supports:

cluster_config {
  encryption_config {
    kms_key_name = "projects/projectId/locations/region/keyRings/keyRingName/cryptoKeys/keyName"
  }
}
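
For example, a sketch that creates the key in the same configuration and references it by its id (resource names are illustrative; note the Dataproc service agent also needs the Encrypter/Decrypter role on the key, which is not shown here):

resource "google_kms_key_ring" "keyring" {
  name     = "dataproc-keyring"
  location = "us-central1"
}

resource "google_kms_crypto_key" "key" {
  name     = "dataproc-key"
  key_ring = google_kms_key_ring.keyring.id
}

resource "google_dataproc_cluster" "mycluster" {
  name   = "mycluster"
  region = "us-central1"

  cluster_config {
    encryption_config {
      # id resolves to projects/.../locations/us-central1/keyRings/dataproc-keyring/cryptoKeys/dataproc-key
      kms_key_name = google_kms_crypto_key.key.id
    }
  }
}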

The dataproc_metric_config block supports:

dataproc_metric_config {
  metrics {
    metric_source    = "HDFS"
    metric_overrides = ["yarn:ResourceManager:QueueMetrics:AppsCompleted"]
  }
}

The auxiliary_node_groups block supports:

auxiliary_node_groups {
  node_group {
    roles = ["DRIVER"]

    node_group_config {
      num_instances    = 2
      machine_type     = "n1-standard-2"
      min_cpu_platform = "AMD Rome"

      disk_config {
        boot_disk_size_gb = 35
        boot_disk_type    = "pd-standard"
        num_local_ssds    = 1
      }

      accelerators {
        accelerator_count = 1
        accelerator_type  = "nvidia-tesla-t4"
      }
    }
  }
}

The lifecycle_config block supports:

cluster_config {
  lifecycle_config {
    idle_delete_ttl = "10m"
    auto_delete_time = "2120-01-01T12:00:00.01Z"
  }
}

The endpoint_config block (Optional, Computed, Beta) supports:

cluster_config {
  endpoint_config {
    enable_http_port_access = true
  }
}

The metastore_config block (Optional, Computed, Beta) supports:

cluster_config {
  metastore_config {
    dataproc_metastore_service = "projects/projectId/locations/region/services/serviceName"
  }
}

Only resource names that include the project ID and location (region) are valid. Examples:

projects/[projectId]/locations/[dataproc_region]/services/[service-name]
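
For example, a sketch that provisions the metastore with google_dataproc_metastore_service and references it by its exported name (the resource names, tier, and Hive version below are illustrative):

resource "google_dataproc_metastore_service" "ms" {
  service_id = "my-metastore"
  location   = "us-central1"
  tier       = "DEVELOPER"

  hive_metastore_config {
    version = "3.1.2"
  }
}

resource "google_dataproc_cluster" "mycluster" {
  name   = "mycluster"
  region = "us-central1"

  cluster_config {
    metastore_config {
      # name resolves to projects/[projectId]/locations/us-central1/services/my-metastore
      dataproc_metastore_service = google_dataproc_metastore_service.ms.name
    }
  }
}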

Attributes Reference

In addition to the arguments listed above, the following computed attributes are exported:

Import

This resource does not support import.

Timeouts

This resource provides the following Timeouts configuration options: create, update, and delete.
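
For example, a sketch overriding the defaults (the values shown are illustrative):

resource "google_dataproc_cluster" "mycluster" {
  name   = "mycluster"
  region = "us-central1"

  timeouts {
    create = "60m"
    update = "60m"
    delete = "60m"
  }
}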