google_dataproc_job

Manages a job resource within a Dataproc cluster within GCE. For more information, see the official Dataproc documentation.

Example Usage

resource "google_dataproc_cluster" "mycluster" {
  name   = "dproc-cluster-unique-name"
  region = "us-central1"
}

# Submit an example spark job to a dataproc cluster
resource "google_dataproc_job" "spark" {
  region       = google_dataproc_cluster.mycluster.region
  force_delete = true
  placement {
    cluster_name = google_dataproc_cluster.mycluster.name
  }

  spark_config {
    main_class    = "org.apache.spark.examples.SparkPi"
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
    args          = ["1000"]

    properties = {
      "spark.logConf" = "true"
    }

    logging_config {
      driver_log_levels = {
        "root" = "INFO"
      }
    }
  }
}

# Submit an example pyspark job to a dataproc cluster
resource "google_dataproc_job" "pyspark" {
  region       = google_dataproc_cluster.mycluster.region
  force_delete = true
  placement {
    cluster_name = google_dataproc_cluster.mycluster.name
  }

  pyspark_config {
    main_python_file_uri = "gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py"
    properties = {
      "spark.logConf" = "true"
    }
  }
}

# Check out the current state of the jobs
output "spark_status" {
  value = google_dataproc_job.spark.status[0].state
}

output "pyspark_status" {
  value = google_dataproc_job.pyspark.status[0].state
}

Argument Reference


The pyspark_config block supports:

Below is an example configuration for submitting a PySpark job to the cluster:

# Submit a pyspark job to the cluster
resource "google_dataproc_job" "pyspark" {
  ...
  pyspark_config {
    main_python_file_uri = "gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py"
    properties = {
      "spark.logConf" = "true"
    }
  }
}

For configurations requiring Hadoop Compatible File System (HCFS) references, the URI schemes below are generally applicable (a combined sketch follows the list):

  - GCS files with the `gs://` prefix
  - HDFS files on the cluster with the `hdfs://` prefix
  - Local files on the cluster with the `file://` prefix
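As a minimal sketch (not part of the cluster example above), the three schemes could be mixed in a single hadoop_config block; the hdfs:// input path, the gs:// bucket name, and the resource label are placeholders:

resource "google_dataproc_job" "hadoop_hcfs" {
  region = google_dataproc_cluster.mycluster.region
  placement {
    cluster_name = google_dataproc_cluster.mycluster.name
  }

  hadoop_config {
    # Local file on the cluster nodes
    main_jar_file_uri = "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar"
    args = [
      "wordcount",
      # Input read from HDFS on the cluster (placeholder path)
      "hdfs:///tmp/wordcount_input",
      # Output written to Cloud Storage (placeholder bucket)
      "gs://my-example-bucket/wordcount_output",
    ]
  }
}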

The spark_config block supports:

# Submit a spark job to the cluster
resource "google_dataproc_job" "spark" {
  ...
  spark_config {
    main_class    = "org.apache.spark.examples.SparkPi"
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
    args          = ["1000"]

    properties = {
      "spark.logConf" = "true"
    }

    logging_config {
      driver_log_levels = {
        "root" = "INFO"
      }
    }
  }
}

The hadoop_config block supports:

# Submit a hadoop job to the cluster
resource "google_dataproc_job" "hadoop" {
  ...
  hadoop_config {
    main_jar_file_uri = "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar"
    args = [
      "wordcount",
      "file:///usr/lib/spark/NOTICE",
      "gs://${google_dataproc_cluster.basic.cluster_config[0].bucket}/hadoopjob_output",
    ]
  }
}

The hive_config block supports:

# Submit a hive job to the cluster
resource "google_dataproc_job" "hive" {
  ...
  hive_config {
    query_list = [
      "DROP TABLE IF EXISTS dprocjob_test",
      "CREATE EXTERNAL TABLE dprocjob_test(bar int) LOCATION 'gs://${google_dataproc_cluster.basic.cluster_config[0].bucket}/hive_dprocjob_test/'",
      "SELECT * FROM dprocjob_test WHERE bar > 2",
    ]
  }
}

The pig_config block supports:

# Submit a pig job to the cluster
resource "google_dataproc_job" "pig" {
  ...
  pig_config {
    query_list = [
      "LNS = LOAD 'file:///usr/lib/pig/LICENSE.txt ' AS (line)",
      "WORDS = FOREACH LNS GENERATE FLATTEN(TOKENIZE(line)) AS word",
      "GROUPS = GROUP WORDS BY word",
      "WORD_COUNTS = FOREACH GROUPS GENERATE group, COUNT(WORDS)",
      "DUMP WORD_COUNTS",
    ]
  }
}

The sparksql_config block supports:

# Submit a spark SQL job to the cluster
resource "google_dataproc_job" "sparksql" {
  ...
  sparksql_config {
    query_list = [
      "DROP TABLE IF EXISTS dprocjob_test",
      "CREATE TABLE dprocjob_test(bar int)",
      "SELECT * FROM dprocjob_test WHERE bar > 2",
    ]
  }
}

The presto_config block supports:

# Submit a Presto job to the cluster
resource "google_dataproc_job" "presto" {
  ...
  presto_config {
    query_list = [
      "DROP TABLE IF EXISTS dprocjob_test",
      "CREATE TABLE dprocjob_test(bar int)",
      "SELECT * FROM dprocjob_test WHERE bar > 2",
    ]
  }
}

Attributes Reference

In addition to the arguments listed above, the following computed attributes are exported:
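For example, the outputs shown earlier surface status[0].state; a similar sketch, assuming driver_output_resource_uri is among the exported attributes, exposes the location of the driver's output:

output "spark_driver_output_uri" {
  value = google_dataproc_job.spark.driver_output_resource_uri
}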

Import

This resource does not support import.

Timeouts

google_dataproc_job provides the following Timeouts configuration options:
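
A minimal sketch of setting these timeouts on a job, assuming the create and delete operations are the configurable ones; the durations shown are placeholders:

resource "google_dataproc_job" "spark" {
  ...

  timeouts {
    create = "10m"
    delete = "10m"
  }
}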