首页 > 代码库 > Spark DataFrame 数据框空值判断和处理

Spark DataFrame 数据框空值判断和处理

scala>     val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala>     data1.limit(10).show
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  male| 37|          10|      no|            3|       18|         7|     4|
|      0|  null| 27|        null|      no|            4|       14|         6|  null|
|      0|  null| 32|        null|     yes|            1|       12|         1|  null|
|      0|  null| 57|        null|     yes|            5|       18|         6|  null|
|      0|  null| 22|        null|      no|            2|       17|         6|  null|
|      0|  null| 32|        null|      no|            2|       17|         5|  null|
|      0|female| 22|        null|      no|            2|       12|         1|  null|
|      0|  male| 57|          15|     yes|            2|       14|         4|     4|
|      0|female| 32|          15|     yes|            4|       16|         1|     2|
|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+------+---+------------+--------+-------------+---------+----------+------+


scala>     
     |     val res=data1.select("yearsmarried").na.drop()
res: org.apache.spark.sql.DataFrame = [yearsmarried: string]

scala>     res.limit(10).show()
+------------+
|yearsmarried|
+------------+
|          10|
|          15|
|          15|
|         1.5|
|          15|
|           4|
|          15|
|         1.5|
|           4|
|          15|
+------------+


scala>     
     |     val res123=data1.na.fill("wangxiao123")
res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala>     res123.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|     rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|      0|       male| 37|          10|      no|            3|       18|         7|          4|
|      0|wangxiao123| 27| wangxiao123|      no|            4|       14|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|     yes|            1|       12|         1|wangxiao123|
|      0|wangxiao123| 57| wangxiao123|     yes|            5|       18|         6|wangxiao123|
|      0|wangxiao123| 22| wangxiao123|      no|            2|       17|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|      no|            2|       17|         5|wangxiao123|
|      0|     female| 22| wangxiao123|      no|            2|       12|         1|wangxiao123|
|      0|       male| 57|          15|     yes|            2|       14|         4|          4|
|      0|     female| 32|          15|     yes|            4|       16|         1|          2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|          5|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+


scala>     
     |     val res2=data1.na.fill(value="wangxiao111",cols=Array("gender","yearsmarried") )
res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala>     res2.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao111| 27| wangxiao111|      no|            4|       14|         6|  null|
|      0|wangxiao111| 32| wangxiao111|     yes|            1|       12|         1|  null|
|      0|wangxiao111| 57| wangxiao111|     yes|            5|       18|         6|  null|
|      0|wangxiao111| 22| wangxiao111|      no|            2|       17|         6|  null|
|      0|wangxiao111| 32| wangxiao111|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao111|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+


scala>     
     |     val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") ) 
res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

scala>     res3.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+
|      0|       male| 37|          10|      no|            3|       18|         7|     4|
|      0|wangxiao222| 27| wangxiao567|      no|            4|       14|         6|  null|
|      0|wangxiao222| 32| wangxiao567|     yes|            1|       12|         1|  null|
|      0|wangxiao222| 57| wangxiao567|     yes|            5|       18|         6|  null|
|      0|wangxiao222| 22| wangxiao567|      no|            2|       17|         6|  null|
|      0|wangxiao222| 32| wangxiao567|      no|            2|       17|         5|  null|
|      0|     female| 22| wangxiao567|      no|            2|       12|         1|  null|
|      0|       male| 57|          15|     yes|            2|       14|         4|     4|
|      0|     female| 32|          15|     yes|            4|       16|         1|     2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|     5|
+-------+-----------+---+------------+--------+-------------+---------+----------+------+


scala>     
     |     
     |     data1.filter("gender is null").limit(10).show
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  null| 27|        null|      no|            4|       14|         6|  null|
|      0|  null| 32|        null|     yes|            1|       12|         1|  null|
|      0|  null| 57|        null|     yes|            5|       18|         6|  null|
|      0|  null| 22|        null|      no|            2|       17|         6|  null|
|      0|  null| 32|        null|      no|            2|       17|         5|  null|
+-------+------+---+------------+--------+-------------+---------+----------+------+


scala>     data1.filter( data1("gender").isNull ).limit(10).show
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  null| 27|        null|      no|            4|       14|         6|  null|
|      0|  null| 32|        null|     yes|            1|       12|         1|  null|
|      0|  null| 57|        null|     yes|            5|       18|         6|  null|
|      0|  null| 22|        null|      no|            2|       17|         6|  null|
|      0|  null| 32|        null|      no|            2|       17|         5|  null|
+-------+------+---+------------+--------+-------------+---------+----------+------+


scala>     
     |     
     |     math.sqrt(-1.0)
res32: Double = NaN

scala>     math.sqrt(-1.0).isNaN()
res33: Boolean = true

 

Spark DataFrame 数据框空值判断和处理