#create dataframe from RDD and create Table from dataframe
from pyspark import SparkContext, SparkConf,HiveContext
%sql
Employee=[(1,"jonny","2000","sales"),
(2,"jack","4000","finance"),
(3,"dinarys","3000","acc"),
(4,"john","5000","hr"),
(5,"theon","3000","sales"),
(6,"bran","5000","finance"),
(7,"john","2000","acc"),
(9,"johny","2000","acc"),
(8,"sansa","2000","hr")]
EmployeeRDD=sc.parallelize(Employee)
EmployeeDF=sqlContext.createDataFrame(EmployeeRDD).toDF('Eid','Emp_Name','Emp_Salary','Dept')
EmployeeDF.registerTempTable('EmployeeTB')
result1=sqlContext.sql("select * from EmployeeTB limit 3")
result1.show()
There are total 4 value functions:
1) Lead
2) Lag
3) First Value
4) Last Value
1) Lead:
The LEAD() window function returns the value for the row after the current row in a partition. If no row exists, null is returned.
from pyspark import SparkContext, SparkConf,HiveContext
%sql
Employee=[(1,"jonny","2000","sales"),
(2,"jack","4000","finance"),
(3,"dinarys","3000","acc"),
(4,"john","5000","hr"),
(5,"theon","3000","sales"),
(6,"bran","5000","finance"),
(7,"john","2000","acc"),
(9,"johny","2000","acc"),
(8,"sansa","2000","hr")]
EmployeeRDD=sc.parallelize(Employee)
EmployeeDF=sqlContext.createDataFrame(EmployeeRDD).toDF('Eid','Emp_Name','Emp_Salary','Dept')
EmployeeDF.registerTempTable('EmployeeTB')
result1=sqlContext.sql("select * from EmployeeTB limit 3")
result1.show()
There are total 4 value functions:
1) Lead
2) Lag
3) First Value
4) Last Value
1) Lead:
The LEAD() window function returns the value for the row after the current row in a partition. If no row exists, null is returned.
ResultLead=sqlContext.sql("select Eid,Emp_Name,Emp_Salary,Dept,Lead(Emp_salary,1,0) over (partition by Dept order by Emp_Salary) as LeadSal from EmployeeTB")
ResultLead.show()
2.lag()
The LAG() window function returns the value for the row before the current row in a partition. If no row exists, null is returned.
ResultLag=sqlContext.sql("select Eid,Emp_Name,Emp_Salary,Dept,Lag(Emp_salary,1,0) over (partition by Dept order by Emp_Salary) as LagSal from EmployeeTB")
ResultLag.show()
3.First_value()
The FIRST_VALUE window function returns the value of the specified expression with respect to the first row in the window frame.
ResultFirst=sqlContext.sql("select Eid,Emp_Name,Emp_Salary,Dept,first_value(Emp_salary) over (partition by Dept order by Emp_Salary asc) as FirstSal from EmployeeTB")
ResultFirst.show()
4.Last_value()
The LAST_VALUE window function returns the value of the specified expression with respect to the last row in the window frame.
ResultLast=sqlContext.sql("select Eid,Emp_Name,Emp_Salary,Dept,last_value(Emp_salary) over (partition by Dept ) as LastSal from EmployeeTB")
ResultLast.show()
No comments:
Post a Comment