添加父列名称作为前缀以避免歧义 [英] add parent column name as prefix to avoid ambiguity

查看:23
本文介绍了添加父列名称作为前缀以避免歧义的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

请查看下面的代码。如果存在重复键,它生成的数据帧会包含有歧义的列名。我们应该如何修改代码,把父列名称作为前缀加到列名上?

Check the code below. It generates a DataFrame with ambiguous column names if duplicate keys are present. How should we modify the code to add the parent column name as a prefix?

添加了另一个包含 json 数据的列.

Added another column with json data.

scala> val df = Seq(
    (77, "email1", """{"key1":38,"key3":39}""","""{"name":"aaa","age":10}"""),
    (78, "email2", """{"key1":38,"key4":39}""","""{"name":"bbb","age":20}"""),
    (178, "email21", """{"key1":"when string","key4":36, "key6":"test", "key10":false }""","""{"name":"ccc","age":30}"""),
    (179, "email8", """{"sub1":"qwerty","sub2":["42"]}""","""{"name":"ddd","age":40}"""),
    (180, "email8", """{"sub1":"qwerty","sub2":["42", "56", "test"]}""","""{"name":"eee","age":50}""")
).toDF("id", "name", "colJson","personInfo")

scala> df.printSchema
root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- colJson: string (nullable = true)
 |-- personInfo: string (nullable = true)

scala> df.show(false)
+---+-------+---------------------------------------------------------------+-----------------------+
|id |name   |colJson                                                        |personInfo             |
+---+-------+---------------------------------------------------------------+-----------------------+
|77 |email1 |{"key1":38,"key3":39}                                          |{"name":"aaa","age":10}|
|78 |email2 |{"key1":38,"key4":39}                                          |{"name":"bbb","age":20}|
|178|email21|{"key1":"when string","key4":36, "key6":"test", "key10":false }|{"name":"ccc","age":30}|
|179|email8 |{"sub1":"qwerty","sub2":["42"]}                                |{"name":"ddd","age":40}|
|180|email8 |{"sub1":"qwerty","sub2":["42", "56", "test"]}                  |{"name":"eee","age":50}|
+---+-------+---------------------------------------------------------------+-----------------------+

创建了 fromJson 隐式函数:可以向它传入多个列,它会解析这些列中的 JSON,并把其中的字段提取为列。

Created a fromJson implicit function. You can pass multiple columns to it, and it will parse the JSON and extract its fields as columns.

scala> :paste
// Entering paste mode (ctrl-D to finish)

import org.apache.spark.sql.{Column, DataFrame, Row}
    import org.apache.spark.sql.functions.from_json
    /**
      * Extension methods for expanding JSON string columns of a DataFrame.
      *
      * @param inDF DataFrame holding the JSON string columns
      */
    implicit class DFHelper(inDF: DataFrame) {
      import inDF.sparkSession.implicits._

      /**
        * Parses each given JSON string column and expands its keys into top-level columns.
        *
        * Child columns keep their bare key names, so a key that matches an existing
        * column name (or a key of another JSON column) yields duplicate column names.
        *
        * @param columns JSON string columns to parse
        * @return DataFrame in which every struct column is replaced by its fields
        */
      def fromJson(columns:Column*):DataFrame = {
        val session = inDF.sparkSession
        // Infer one schema per column from that column's own JSON values.
        val inferred = columns.map { jsonCol =>
          val jsonStrings = inDF.select(jsonCol).as[String]
          jsonCol -> session.read.json(jsonStrings).schema
        }
        // Overwrite each JSON string column with its parsed struct.
        val parsed = inferred.foldLeft(inDF) { case (acc, (jsonCol, jsonSchema)) =>
          acc.withColumn(jsonCol.toString(), from_json(jsonCol, jsonSchema))
        }
        // Star-expand every struct column; pass all other columns through unchanged.
        val projections = parsed.schema.fields.map { field =>
          field.dataType.typeName match {
            case "struct" => s"${field.name}.*"
            case _        => field.name
          }
        }
        parsed.selectExpr(projections: _*)
      }
    }

// Exiting paste mode, now interpreting.

import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.functions.from_json
defined class DFHelper

scala> df.fromJson($"colJson",$"personInfo").show(false)

+---+-------+-----------+-----+----+----+----+------+--------------+---+----+
|id |name   |key1       |key10|key3|key4|key6|sub1  |sub2          |age|name|
+---+-------+-----------+-----+----+----+----+------+--------------+---+----+
|77 |email1 |38         |null |39  |null|null|null  |null          |10 |aaa |
|78 |email2 |38         |null |null|39  |null|null  |null          |20 |bbb |
|178|email21|when string|false|null|36  |test|null  |null          |30 |ccc |
|179|email8 |null       |null |null|null|null|qwerty|[42]          |40 |ddd |
|180|email8 |null       |null |null|null|null|qwerty|[42, 56, test]|50 |eee |
+---+-------+-----------+-----+----+----+----+------+--------------+---+----+

scala> df.fromJson($"colJson",$"personInfo").printSchema()
root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- key1: string (nullable = true)
 |-- key10: boolean (nullable = true)
 |-- key3: long (nullable = true)
 |-- key4: long (nullable = true)
 |-- key6: string (nullable = true)
 |-- sub1: string (nullable = true)
 |-- sub2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

推荐答案

试试这个-

  df.show(false)
    df.printSchema()
    /**
      * +---+-------+---------------------------------------------------------------+-----------------------+
      * |id |name   |colJson                                                        |personInfo             |
      * +---+-------+---------------------------------------------------------------+-----------------------+
      * |77 |email1 |{"key1":38,"key3":39}                                          |{"name":"aaa","age":10}|
      * |78 |email2 |{"key1":38,"key4":39}                                          |{"name":"bbb","age":20}|
      * |178|email21|{"key1":"when string","key4":36, "key6":"test", "key10":false }|{"name":"ccc","age":30}|
      * |179|email8 |{"sub1":"qwerty","sub2":["42"]}                                |{"name":"ddd","age":40}|
      * |180|email8 |{"sub1":"qwerty","sub2":["42", "56", "test"]}                  |{"name":"eee","age":50}|
      * +---+-------+---------------------------------------------------------------+-----------------------+
      *
      * root
      * |-- id: integer (nullable = false)
      * |-- name: string (nullable = true)
      * |-- colJson: string (nullable = true)
      * |-- personInfo: string (nullable = true)
      *
      * @param inDF
      */

    /**
      * Extension methods for parsing JSON string columns of a DataFrame without
      * introducing ambiguous column names.
      *
      * @param inDF DataFrame holding the JSON string columns
      */
    implicit class DFHelper(inDF: DataFrame) {
      import inDF.sparkSession.implicits._
      import org.apache.spark.sql.types.StructType

      /**
        * Replaces each given JSON string column with a struct column parsed from it.
        *
        * The schema of every column is inferred from that column's own JSON values.
        * Structs are kept nested, so keys that collide with other column names
        * (e.g. top-level `name` vs. `personInfo.name`) remain unambiguous and are
        * addressed as `<parent_col>.<child_col>`.
        *
        * @param columns JSON string columns to parse
        * @return inDF with each listed column replaced by its parsed struct
        */
      def fromJson(columns: Column*): DataFrame = {
        val schemas = columns.map { column =>
          column -> inDF.sparkSession.read.json(inDF.select(column).as[String]).schema
        }
        schemas.foldLeft(inDF) { case (df, (column, schema)) =>
          // Reusing the column's own name makes withColumn overwrite the JSON string in place.
          df.withColumn(column.toString(), from_json(column, schema))
        }
      }

      /**
        * Expands every top-level struct column into one column per field, naming each
        * new column `<parent><separator><child>` so that identical keys coming from
        * different parents cannot clash. Non-struct columns are passed through unchanged.
        *
        * Typical use: `df.fromJson($"colJson", $"personInfo").flattenWithPrefix()`.
        *
        * @param separator text placed between the parent and child names (default "_")
        * @return DataFrame with struct columns flattened one level deep
        */
      def flattenWithPrefix(separator: String = "_"): DataFrame = {
        val flattened = inDF.schema.fields.toSeq.flatMap { field =>
          field.dataType match {
            case struct: StructType =>
              struct.fieldNames.toSeq.map { child =>
                // Backticks keep names containing dots from being read as nested paths.
                inDF(s"`${field.name}`.`$child`").as(s"${field.name}$separator$child")
              }
            case _ =>
              Seq(inDF(s"`${field.name}`"))
          }
        }
        inDF.select(flattened: _*)
      }
    }

    // Parse both JSON string columns; each one becomes a struct column of the same name.
    val p = df.fromJson($"colJson", $"personInfo")
    p.show(false)
    p.printSchema()
    /**
      * +---+-------+---------------------------------+----------+
      * |id |name   |colJson                          |personInfo|
      * +---+-------+---------------------------------+----------+
      * |77 |email1 |[38,, 39,,,,]                    |[10, aaa] |
      * |78 |email2 |[38,,, 39,,,]                    |[20, bbb] |
      * |178|email21|[when string, false,, 36, test,,]|[30, ccc] |
      * |179|email8 |[,,,,, qwerty, [42]]             |[40, ddd] |
      * |180|email8 |[,,,,, qwerty, [42, 56, test]]   |[50, eee] |
      * +---+-------+---------------------------------+----------+
      *
      * root
      * |-- id: integer (nullable = false)
      * |-- name: string (nullable = true)
      * |-- colJson: struct (nullable = true)
      * |    |-- key1: string (nullable = true)
      * |    |-- key10: boolean (nullable = true)
      * |    |-- key3: long (nullable = true)
      * |    |-- key4: long (nullable = true)
      * |    |-- key6: string (nullable = true)
      * |    |-- sub1: string (nullable = true)
      * |    |-- sub2: array (nullable = true)
      * |    |    |-- element: string (containsNull = true)
      * |-- personInfo: struct (nullable = true)
      * |    |-- age: long (nullable = true)
      * |    |-- name: string (nullable = true)
      */

    // fetch columns of struct using <parent_col>.<child_col>
    p.select($"colJson.key1", $"personInfo.age").show(false)
    /**
      * +-----------+---+
      * |key1       |age|
      * +-----------+---+
      * |38         |10 |
      * |38         |20 |
      * |when string|30 |
      * |null       |40 |
      * |null       |50 |
      * +-----------+---+
      */

这篇关于添加父列名称作为前缀以避免歧义的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆