catalog = "<catalog_name>"
schema = "<schema_name>"
volume = "<volume_name>"
file_name = "new_baby_names.csv"
table_name = "baby_names"
path_volume = "/Volumes/" + catalog + "/" + schema + "/" + volume
path_table = catalog + "." + schema
print(path_table) # Show the complete path
print(path_volume) # Show the complete path
val catalog = "<catalog_name>"
val schema = "<schema_name>"
val volume = "<volume_name>"
val fileName = "new_baby_names.csv"
val tableName = "baby_names"
val pathVolume = s"/Volumes/${catalog}/${schema}/${volume}"
val pathTable = s"${catalog}.${schema}"
println(pathVolume) // Show the complete path
println(pathTable) // Show the complete path
catalog <- "<catalog_name>"
schema <- "<schema_name>"
volume <- "<volume_name>"
file_name <- "new_baby_names.csv"
table_name <- "baby_names"
path_volume <- paste0("/Volumes/", catalog, "/", schema, "/", volume)
path_table <- paste0(catalog, ".", schema)
print(path_volume) # Show the complete path
print(path_table) # Show the complete path
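If you want to confirm that the volume path resolves before writing to it, you can list the volume's contents. This optional check is a sketch for the Python tab; dbutils is available in Databricks notebooks.

# Optional: list the volume's contents to verify the path before writing to it.
display(dbutils.fs.ls(path_volume))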
Step 3: Add a new CSV file of data to your Unity Catalog volume
This step creates a DataFrame named df with a new baby name for 2022 and then saves that data to a new CSV file in your Unity Catalog volume.
This step simulates adding new yearly data to the existing data loaded for previous years. In a production environment, this incremental data would be stored in cloud storage.
Copy and paste the following code into the new, empty notebook cell. This code creates the DataFrame with the additional baby name data, and then writes that data to a CSV file in your Unity Catalog volume.
data = [[2022, "CARL", "Albany", "M", 42]]
df = spark.createDataFrame(data, schema="Year int, First_Name STRING, County STRING, Sex STRING, Count int")
# display(df)
(df.coalesce(1)
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv(f"{path_volume}/{file_name}"))
// toDF on a Seq requires the Spark session's implicits in scope
// (pre-imported in Databricks notebooks; shown here for completeness).
import spark.implicits._

val data = Seq((2022, "CARL", "Albany", "M", 42))
val columns = Seq("Year", "First_Name", "County", "Sex", "Count")
val df = data.toDF(columns: _*)
// display(df)
df.coalesce(1)
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv(s"$pathVolume/$fileName")
# Load the SparkR package that is already preinstalled on the cluster.
library(SparkR)
data <- data.frame(Year = 2022,
                   First_Name = "CARL",
                   County = "Albany",
                   Sex = "M",
                   Count = 42)
df <- createDataFrame(data)
# display(df)
write.df(df, path = paste0(path_volume, "/", file_name),
         source = "csv",
         mode = "overwrite",
         header = "true")
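Note that Spark's CSV writer creates a directory at the target path containing the data as one or more part files (coalesce(1) keeps it to a single part file), not a single file literally named new_baby_names.csv. The read in the next step works because Spark reads that directory as a whole. If you want to inspect what was written, a minimal Python sketch:

# List the files Spark wrote under the target path; expect one
# part-*.csv data file plus Spark's _SUCCESS marker.
for f in dbutils.fs.ls(f"{path_volume}/{file_name}"):
    print(f.name, f.size)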
Step 4: Load data into DataFrame from CSV file
Copy and paste the following code into an empty notebook cell. This code loads the new baby names data into a new DataFrame named df1 from the CSV file.
df1 = spark.read.csv(f"{path_volume}/{file_name}",
                     header=True,
                     inferSchema=True,
                     sep=",")
display(df1)
val df1 = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv(s"$pathVolume/$fileName")
display(df1)
df1 <- read.df(paste0(path_volume, "/", file_name),
               source = "csv",
               header = TRUE,
               inferSchema = TRUE)
display(df1)
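Because inferSchema samples the file to guess column types, it is worth confirming that the inferred types line up with the target table before appending. A quick optional check in Python (printSchema is standard PySpark):

# Verify the inferred schema; Year and Count should come back as integers
# so they match the existing table's columns.
df1.printSchema()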
Step 5: Insert data into the existing table
Copy and paste the following code into an empty notebook cell. This code appends the new baby names data from the DataFrame to the existing table.
df1.write.mode("append").insertInto(f"{path_table}.{table_name}")
display(spark.sql(f"SELECT * FROM {path_table}.{table_name} WHERE Year = 2022"))
df1.write.mode("append").insertInto(s"${pathTable}.${tableName}")
display(spark.sql(s"SELECT * FROM ${pathTable}.${tableName} WHERE Year = 2022"))
# The write.df function in R, as provided by the SparkR package, does not directly support writing to Unity Catalog.
# In this example, you write the DataFrame to a temporary view and then use a SQL command to insert the data from the temporary view into the Unity Catalog table.
createOrReplaceTempView(df1, "temp_view")
sql(paste0("INSERT INTO ", path_table, ".", table_name, " SELECT * FROM temp_view"))
display(sql(paste0("SELECT * FROM ", path_table, ".", table_name, " WHERE Year = 2022")))
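Note that insertInto resolves columns by position rather than by name, so the DataFrame's column order must match the table's. If you prefer a name-based append, one alternative sketch in Python (assuming the target is a Delta table, the default on Databricks):

# Name-based append: for Delta tables, saveAsTable in append mode matches
# columns by name, so df1's column order does not have to match the table.
df1.write.mode("append").saveAsTable(f"{path_table}.{table_name}")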
Next steps
To learn about cleansing and enhancing data, see Get started: Enhance and cleanse data.
Additional resources
Get started: Query and visualize data from a notebook
Get started: Import and visualize CSV data from a notebook
Tutorial: Load and transform data using Apache Spark DataFrames