from pyspark.sql.functions import lit

df = sqlContext.createDataFrame(
    [(1, "a", 23.0), (3, "B", -23.0)], ("x1", "x2", "x3"))

# withColumn() expects a Column expression, so wrap the constant in lit()
df_with_x4 = df.withColumn("x4", lit(0))
df_with_x4.show()
## +---+---+-----+---+
## | x1| x2| x3| x4|
## +---+---+-----+---+
## | 1| a| 23.0| 0|
## | 3| B|-23.0| 0|
## +---+---+-----+---+
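# lit() is only needed for constants; the new column can also be derived
# from existing columns. A minimal sketch (the name "x5" is illustrative):
df_with_x5 = df_with_x4.withColumn("x5", df_with_x4["x3"] * 2)
df_with_x5.show()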
// `coder` is assumed to be a UDF defined elsewhere that maps Amt to a code.
// read.parquet replaces the deprecated parquetFile; withColumn returns a
// new DataFrame, so assign the result rather than discarding it.
val myDF = sqlContext.read.parquet("hdfs:/to/my/file.parquet")
val myDFWithCode = myDF.withColumn("Code", coder(myDF("Amt")))
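# A hypothetical PySpark equivalent of the Scala snippet above: this `coder`
# UDF is an illustrative stand-in, not the original author's definition.
# Note that a plain Column expression is usually faster than a Python UDF
# when one suffices.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

coder = udf(
    lambda amt: "credit" if amt is not None and amt > 0 else "debit",
    StringType())
myDF = sqlContext.read.parquet("hdfs:/to/my/file.parquet")
myDF.withColumn("Code", coder(myDF["Amt"])).show()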
from pyspark.sql.functions import expr

# Using the withColumn() method with a SQL expression;
# `foo` is a DataFrame of flight records
foo2 = foo.withColumn(
    "status", expr("CASE WHEN delay <= 10 THEN 'On-time' ELSE 'Delayed' END")
)
foo2.show()
# Output:
# +--------+-----+--------+------+-----------+-------+
# | date|delay|distance|origin|destination| status|
# +--------+-----+--------+------+-----------+-------+
# |01010710| 31| 590| SEA| SFO|Delayed|
# |01010955| 104| 590| SEA| SFO|Delayed|
# |01010730| 5| 590| SEA| SFO|On-time|
# +--------+-----+--------+------+-----------+-------+
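# The same column can be built without a SQL string via the when()/otherwise()
# Column API; a sketch equivalent to the expr() version above:
from pyspark.sql.functions import when

foo2 = foo.withColumn(
    "status", when(foo["delay"] <= 10, "On-time").otherwise("Delayed")
)
foo2.show()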