Managing Datasets#
A dataset represents a collection of tables, and applies several default policies to tables as they are created:
An access control list (ACL). When created, a dataset has an ACL which maps to the ACL inherited from its project.
A default table expiration period. If set, any new table created within the dataset will have that value as its expiration period.
See the BigQuery documentation for more information on datasets.
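Both policies surface on the client's Dataset object, as the access_entries and default_table_expiration_ms properties. A minimal sketch of reading them back (the dataset ID below is a placeholder):
from google.cloud import bigquery
client = bigquery.Client()
dataset = client.get_dataset("your-project.your_dataset") # Make an API request.
# The default table expiration is exposed in milliseconds, or None if unset.
print("Default table expiration (ms): {}".format(dataset.default_table_expiration_ms))
# The dataset ACL is exposed as a list of AccessEntry objects.
for entry in dataset.access_entries:
    print("{}: {} {}".format(entry.role, entry.entity_type, entry.entity_id))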
Listing Datasets#
List datasets for a project with the list_datasets() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
datasets = list(client.list_datasets()) # Make an API request.
project = client.project
if datasets:
    print("Datasets in project {}:".format(project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(project))
List datasets by label for a project with the list_datasets() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
label_filter = "labels.color:green"
datasets = list(client.list_datasets(filter=label_filter)) # Make an API request.
if datasets:
    print("Datasets filtered by {}:".format(label_filter))
    for dataset in datasets:
        print("\t{}.{}".format(dataset.project, dataset.dataset_id))
else:
    print("No datasets found with this filter.")
Getting a Dataset#
Get a dataset resource (to pick up changes made by another client) with the get_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
friendly_name = dataset.friendly_name
print(
    "Got dataset '{}' with friendly_name '{}'.".format(
        full_dataset_id, friendly_name
    )
)
# View dataset properties
print("Description: {}".format(dataset.description))
print("Labels:")
labels = dataset.labels
if labels:
    for label, value in labels.items():
        print("\t{}: {}".format(label, value))
else:
    print("\tDataset has no labels defined.")
# View tables in dataset
print("Tables:")
tables = list(client.list_tables(dataset)) # API request(s)
if tables:
    for table in tables:
        print("\t{}".format(table.table_id))
else:
    print("\tThis dataset does not contain any tables.")
Determine if a dataset exists with the get_dataset() method:
from google.cloud.exceptions import NotFound
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to determine existence.
# dataset_id = "your-project.your_dataset"
try:
    client.get_dataset(dataset_id) # Make an API request.
    print("Dataset {} already exists".format(dataset_id))
except NotFound:
    print("Dataset {} is not found".format(dataset_id))
Creating a Dataset#
Create a new dataset with the create_dataset() method:
from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to create.
# dataset_id = "{}.your_dataset".format(client.project)
# Construct a full Dataset object to send to the API.
dataset = bigquery.Dataset(dataset_id)
# TODO(developer): Specify the geographic location where the dataset should reside.
dataset.location = "US"
# Send the dataset to the API for creation.
# Raises google.api_core.exceptions.Conflict if the Dataset already
# exists within the project.
dataset = client.create_dataset(dataset) # Make an API request.
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
Updating a Dataset#
Update a property in a dataset’s metadata with the update_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
dataset.description = "Updated description."
dataset = client.update_dataset(dataset, ["description"]) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
print(
    "Updated dataset '{}' with description '{}'.".format(
        full_dataset_id, dataset.description
    )
)
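Several properties can be updated in a single call by listing each changed field name; a short sketch updating the friendly name and the default table expiration together (both values are examples):
dataset = client.get_dataset(dataset_id) # Make an API request.
dataset.friendly_name = "My example dataset"
dataset.default_table_expiration_ms = 7 * 24 * 60 * 60 * 1000 # 7 days
dataset = client.update_dataset(
    dataset, ["friendly_name", "default_table_expiration_ms"]
) # Make an API request.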
Modify user permissions on a dataset with the update_dataset() method:
from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = 'your-project.your_dataset'
dataset = client.get_dataset(dataset_id) # Make an API request.
entry = bigquery.AccessEntry(
    role="READER",
    entity_type="userByEmail",
    entity_id="sample.bigquery.dev@gmail.com",
)
entries = list(dataset.access_entries)
entries.append(entry)
dataset.access_entries = entries
dataset = client.update_dataset(dataset, ["access_entries"]) # Make an API request.
full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
print(
    "Updated dataset '{}' with modified user permissions.".format(full_dataset_id)
)
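Revoking access follows the same pattern in reverse: rebuild access_entries without the entry to remove, then send the same one-field update (continuing with the dataset and email address from the snippet above):
entries = [
    entry
    for entry in dataset.access_entries
    if entry.entity_id != "sample.bigquery.dev@gmail.com"
]
dataset.access_entries = entries
dataset = client.update_dataset(dataset, ["access_entries"]) # Make an API request.
print("Revoked dataset access for 'sample.bigquery.dev@gmail.com'.")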
Managing Dataset Labels#
Add labels to a dataset with the update_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
dataset.labels = {"color": "green"}
dataset = client.update_dataset(dataset, ["labels"]) # Make an API request.
print("Labels added to {}".format(dataset_id))
Get a dataset’s labels with the get_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
# View dataset labels
print("Dataset ID: {}".format(dataset_id))
print("Labels:")
if dataset.labels:
    for label, value in dataset.labels.items():
        print("\t{}: {}".format(label, value))
else:
    print("\tDataset has no labels defined.")
Delete a dataset’s labels with the update_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"
dataset = client.get_dataset(dataset_id) # Make an API request.
# To delete a label from a dataset, set its value to None
dataset.labels["color"] = None
dataset = client.update_dataset(dataset, ["labels"]) # Make an API request.
print("Labels deleted from {}".format(dataset_id))
Deleting a Dataset#
Delete a dataset with the delete_dataset() method:
# TODO(developer): Import the client library.
# from google.cloud import bigquery
# TODO(developer): Construct a BigQuery client object.
# client = bigquery.Client()
# TODO(developer): Set dataset_id to the ID of the dataset to delete.
# dataset_id = 'your-project.your_dataset'
# Use the delete_contents parameter to delete a dataset and its contents.
# Use the not_found_ok parameter to avoid an error if the dataset has already been deleted.
client.delete_dataset(
    dataset_id, delete_contents=True, not_found_ok=True
) # Make an API request.
print("Deleted dataset '{}'.".format(dataset_id))