  • 输入网页链接,自动生成快照
  • 标签化管理网页链接
defined class A
scala> case class B(c: List[A], d: Map[String, A], e: Map[Int, String], f: Map[A, String])
defined class B
scala> def a_gen(i: Int) = A(s"str_$i", i)
a_gen: (i: Int)A
scala> def b_gen(i: Int) = B((1 to 10).map(a_gen).toList, (1 to 10).map(j => s"key_$j" -> a_gen(j)).toMap, (1 to 10).map(j => j -> s"value_$j").toMap, (1 to 10).map(j => a_gen(j) -> s"value_$j").toMap)
b_gen: (i: Int)B
scala> val data = (1 to 10).map(b_gen)
scala> val df = spark.createDataFrame(data)
df: org.apache.spark.sql.DataFrame = [c: array<struct<a:string,b:int>>, d: map<string,struct<a:string,b:int>> ... 2 more fields]
scala> df.show
+--------------------+--------------------+--------------------+--------------------+
|                   c|                   d|                   e|                   f|
+--------------------+--------------------+--------------------+--------------------+
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
|[[str_1, 1], [str...|[key_2 -> [str_2,...|[5 -> value_5, 10...|[[str_8, 8] -> va...|
+--------------------+--------------------+--------------------+--------------------+
scala> df.printSchema
 |-- c: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- a: string (nullable = true)
 |    |    |-- b: integer (nullable = false)
 |-- d: map (nullable = true)
 |    |-- key: string
 |    |-- value: struct (valueContainsNull = true)
 |    |    |-- a: string (nullable = true)
 |    |    |-- b: integer (nullable = false)
 |-- e: map (nullable = true)
 |    |-- key: integer
 |    |-- value: string (valueContainsNull = true)
 |-- f: map (nullable = true)
 |    |-- key: struct
 |    |-- value: string (valueContainsNull = true)
 |    |    |-- a: string (nullable = true)
 |    |    |-- b: integer (nullable = false)

数组\列表 array 的索引方式

我们首先来看一下数组\列表 array 的索引方式:

scala> df.select("c.a").show(10, false)
|a                                                                      |
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
|[str_1, str_2, str_3, str_4, str_5, str_6, str_7, str_8, str_9, str_10]|
scala> df.select("c.a").printSchema
 |-- a: array (nullable = true)
 |    |-- element: string (containsNull = true)
//> SELECT explode(array(10, 20));
// 10
// 20
scala> df.select(expr("explode(c.a)")).show
|   col|
| str_1|
| str_2|
| str_3|
| str_4|
| str_5|
| str_6|
| str_7|
| str_8|
| str_9|
|str_10|
| str_1|
| str_2|
| str_3|
| str_4|
| str_5|
| str_6|
| str_7|
| str_8|
| str_9|
|str_10|
only showing top 20 rows
scala> df.select(expr("explode(c.a)")).printSchema
 |-- col: string (nullable = true)
scala> df.select(expr("explode(c)")).show
|         col|
|  [str_1, 1]|
|  [str_2, 2]|
|  [str_3, 3]|
|  [str_4, 4]|
|  [str_5, 5]|
|  [str_6, 6]|
|  [str_7, 7]|
|  [str_8, 8]|
|  [str_9, 9]|
|[str_10, 10]|
|  [str_1, 1]|
|  [str_2, 2]|
|  [str_3, 3]|
|  [str_4, 4]|
|  [str_5, 5]|
|  [str_6, 6]|
|  [str_7, 7]|
|  [str_8, 8]|
|  [str_9, 9]|
|[str_10, 10]|
only showing top 20 rows
scala> df.select(expr("explode(c)")).printSchema
 |-- col: struct (nullable = true)
 |    |-- a: string (nullable = true)
 |    |-- b: integer (nullable = false)
scala> df.select(expr("inline(c)")).show
|     a|  b|
| str_1|  1|
| str_2|  2|
| str_3|  3|
| str_4|  4|
| str_5|  5|
| str_6|  6|
| str_7|  7|
| str_8|  8|
| str_9|  9|
|str_10| 10|
| str_1|  1|
| str_2|  2|
| str_3|  3|
| str_4|  4|
| str_5|  5|
| str_6|  6|
| str_7|  7|
| str_8|  8|
| str_9|  9|
|str_10| 10|
only showing top 20 rows
scala> df.select(expr("inline(c)")).printSchema
 |-- a: string (nullable = true)
 |-- b: integer (nullable = false)
scala> df.select(expr("posexplode(d)")).printSchema
 |-- pos: integer (nullable = false)
 |-- key: string (nullable = false)
 |-- value: struct (nullable = true)
 |    |-- a: string (nullable = true)
 |    |-- b: integer (nullable = false)
scala> df.select(expr("posexplode(e)")).printSchema
 |-- pos: integer (nullable = false)
 |-- key: integer (nullable = false)
 |-- value: string (nullable = true)
scala> df.select(expr("posexplode(f)")).show
|pos|         key|   value|
|  0|  [str_8, 8]| value_8|
|  1|[str_10, 10]|value_10|
|  2|  [str_3, 3]| value_3|
|  3|  [str_1, 1]| value_1|
|  4|  [str_6, 6]| value_6|
|  5|  [str_5, 5]| value_5|
|  6|  [str_7, 7]| value_7|
|  7|  [str_2, 2]| value_2|
|  8|  [str_4, 4]| value_4|
|  9|  [str_9, 9]| value_9|
|  0|  [str_8, 8]| value_8|
|  1|[str_10, 10]|value_10|
|  2|  [str_3, 3]| value_3|
|  3|  [str_1, 1]| value_1|
|  4|  [str_6, 6]| value_6|
|  5|  [str_5, 5]| value_5|
|  6|  [str_7, 7]| value_7|
|  7|  [str_2, 2]| value_2|
|  8|  [str_4, 4]| value_4|
|  9|  [str_9, 9]| value_9|
scala> df.select(expr("posexplode(f)")).printSchema
 |-- pos: integer (nullable = false)
 |-- key: struct (nullable = false)
 |    |-- a: string (nullable = true)
 |    |-- b: integer (nullable = false)
 |-- value: string (nullable = true)
scala> df.select("d.key_1").show
|     key_1|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
|[str_1, 1]|
scala> df.select("d.key_1").printSchema
 |-- key_1: struct (nullable = true)
 |    |-- a: string (nullable = true)
 |    |-- b: integer (nullable = false)
scala> df.select("e.1").show
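|      1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|
|value_1|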
scala> df.select("e.1").printSchema
 |-- 1: string (nullable = true)



scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).show
|f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
|                                      value_1|
scala> df.select(expr("f[('str_1' AS a, 1 AS b)]")).printSchema
 |-- f[named_struct(a, str_1 AS `a`, b, 1 AS `b`)]: string (nullable = true)



primaryExpression
    : #前面太长不看
    | '(' namedExpression (',' namedExpression)+ ')'         #rowConstructor
    | value=primaryExpression '[' index=valueExpression ']'  #subscript
valueExpression
    : primaryExpression
namedExpression
    : expression (AS? (identifier | identifierList))?

3、primaryExpression可以是一个'(' namedExpression (',' namedExpression)+ ')'结构
4、namedExpression又是一个exp AS alias的结构
