Skip to content
Home » SQL Server » SQL vs PySpark: A Complete Guide with 50 Query Comparisons

SQL vs PySpark: A Complete Guide with 50 Query Comparisons

5/5 - (1 vote)

SQL vs PySpark for Data Engineers: 50 Hands-On Comparisons

1. Select all columns

SQL: SELECT * FROM employees;
PySpark: df = spark.table("employees")

2. Select specific columns

SQL: SELECT name, salary FROM employees;
PySpark: df.select("name", "salary")

3. Filter rows

SQL: SELECT * FROM employees WHERE salary > 50000;
PySpark: df.filter(df.salary > 50000)

4. Multiple conditions (AND)

SQL: SELECT * FROM employees WHERE salary > 50000 AND dept = 'IT';
PySpark: df.filter((df.salary > 50000) & (df.dept == "IT"))

5. OR condition

SQL: SELECT * FROM employees WHERE dept = 'HR' OR dept = 'IT';
PySpark: df.filter((df.dept == "HR") | (df.dept == "IT"))

6. LIKE pattern

SQL: SELECT * FROM employees WHERE name LIKE 'A%';
PySpark: df.filter(df.name.like("A%"))

7. IN clause

SQL: SELECT * FROM employees WHERE dept IN ('HR','IT');
PySpark: df.filter(df.dept.isin("HR","IT"))

8. BETWEEN

SQL: SELECT * FROM employees WHERE salary BETWEEN 30000 AND 60000;
PySpark: df.filter(df.salary.between(30000, 60000))




9. ORDER BY

SQL: SELECT * FROM employees ORDER BY salary DESC;
PySpark: df.orderBy(df.salary.desc())

10. LIMIT

SQL: SELECT * FROM employees LIMIT 5;
PySpark: df.limit(5)

11. DISTINCT

SQL: SELECT DISTINCT dept FROM employees;
PySpark: df.select("dept").distinct()

12. COUNT

SQL: SELECT COUNT(*) FROM employees;
PySpark: df.count()

13. SUM

SQL: SELECT SUM(salary) FROM employees;
PySpark: df.agg({"salary":"sum"})

14. AVG

SQL: SELECT AVG(salary) FROM employees;
PySpark: df.agg({"salary":"avg"})

15. GROUP BY

SQL: SELECT dept, COUNT(*) FROM employees GROUP BY dept;
PySpark: df.groupBy("dept").count()

16. GROUP BY with SUM

SQL: SELECT dept, SUM(salary) FROM employees GROUP BY dept;
PySpark: df.groupBy("dept").sum("salary")

17. HAVING

SQL: SELECT dept, COUNT(*) FROM employees GROUP BY dept HAVING COUNT(*) > 5;
PySpark: df.groupBy("dept").count().filter("count > 5")

18. INNER JOIN

SQL: SELECT e.name, d.dept_name FROM employees e INNER JOIN dept d ON e.dept_id = d.id;
PySpark: df.join(dept_df, df.dept_id == dept_df.id, "inner").select(df.name, dept_df.dept_name)

19. LEFT JOIN

SQL: SELECT * FROM employees e LEFT JOIN dept d ON e.dept_id = d.id;
PySpark: df.join(dept_df, df.dept_id == dept_df.id, "left")

20. RIGHT JOIN

SQL: SELECT * FROM employees e RIGHT JOIN dept d ON e.dept_id = d.id;
PySpark: df.join(dept_df, df.dept_id == dept_df.id, "right")

21. FULL JOIN

SQL: SELECT * FROM employees e FULL JOIN dept d ON e.dept_id = d.id;
PySpark: df.join(dept_df, df.dept_id == dept_df.id, "full")




22. Alias column

SQL: SELECT name AS employee_name FROM employees;
PySpark: df.select(df.name.alias("employee_name"))

23. Column expression

SQL: SELECT salary * 2 AS double_salary FROM employees;
PySpark: df.select((df.salary * 2).alias("double_salary"))

24. CASE WHEN

SQL: SELECT name, CASE WHEN salary > 50000 THEN 'High' ELSE 'Low' END FROM employees;
PySpark: df.select("name", when(df.salary > 50000, "High").otherwise("Low"))

25. NULL check

SQL: SELECT * FROM employees WHERE salary IS NULL;
PySpark: df.filter(df.salary.isNull())

26. NOT NULL

SQL: SELECT * FROM employees WHERE salary IS NOT NULL;
PySpark: df.filter(df.salary.isNotNull())

27. Drop duplicates

SQL: SELECT DISTINCT * FROM employees;
PySpark: df.dropDuplicates()

28. UNION

SQL: SELECT * FROM emp1 UNION SELECT * FROM emp2;
PySpark: df1.union(df2).distinct()

29. UNION ALL

SQL: SELECT * FROM emp1 UNION ALL SELECT * FROM emp2;
PySpark: df1.union(df2)

30. INTERSECT

SQL: SELECT * FROM emp1 INTERSECT SELECT * FROM emp2;
PySpark: df1.intersect(df2)

31. EXCEPT

SQL: SELECT * FROM emp1 EXCEPT SELECT * FROM emp2;
PySpark: df1.subtract(df2)

32. Create view

SQL: CREATE VIEW emp_view AS SELECT * FROM employees;
PySpark: df.createOrReplaceTempView("emp_view")

33. Subquery

SQL: SELECT * FROM employees WHERE salary > (SELECT AVG(salary) FROM employees);
PySpark: avg_sal = df.agg({"salary":"avg"}).collect()[0][0]; df.filter(df.salary > avg_sal)

34. ROW_NUMBER

SQL: SELECT name, ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("rn", row_number().over(Window.partitionBy("dept").orderBy(df.salary.desc())))

35. RANK

SQL: SELECT name, RANK() OVER (ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("rank", rank().over(Window.orderBy(df.salary.desc())))

36. DENSE_RANK

SQL: SELECT name, DENSE_RANK() OVER (ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("dense_rank", dense_rank().over(Window.orderBy(df.salary.desc())))

37. LEAD

SQL: SELECT name, LEAD(salary) OVER (ORDER BY salary) FROM employees;
PySpark: df.withColumn("lead_salary", lead("salary").over(Window.orderBy("salary")))

38. LAG

SQL: SELECT name, LAG(salary) OVER (ORDER BY salary) FROM employees;
PySpark: df.withColumn("lag_salary", lag("salary").over(Window.orderBy("salary")))

39. CURRENT DATE

SQL: SELECT CURRENT_DATE;
PySpark: spark.range(1).select(current_date())

40. DATE ADD

SQL: SELECT DATE_ADD(CURRENT_DATE, 5);
PySpark: spark.range(1).select(date_add(current_date(), 5))

41. CONCAT

SQL: SELECT CONCAT(name, dept) FROM employees;
PySpark: df.select(concat("name","dept"))

42. LENGTH

SQL: SELECT LENGTH(name) FROM employees;
PySpark: df.select(length("name"))




43. UPPER

SQL: SELECT UPPER(name) FROM employees;
PySpark: df.select(upper("name"))

44. LOWER

SQL: SELECT LOWER(name) FROM employees;
PySpark: df.select(lower("name"))

45. TRIM

SQL: SELECT TRIM(name) FROM employees;
PySpark: df.select(trim("name"))

46. CAST

SQL: SELECT CAST(salary AS STRING) FROM employees;
PySpark: df.select(df.salary.cast("string"))

47. DROP COLUMN

SQL: ALTER TABLE employees DROP COLUMN age;
PySpark: df.drop("age")

48. RENAME COLUMN

SQL: ALTER TABLE employees RENAME COLUMN name TO emp_name;
PySpark: df.withColumnRenamed("name", "emp_name")

49. ADD COLUMN

SQL: SELECT *, salary*0.1 AS bonus FROM employees;
PySpark: df.withColumn("bonus", df.salary * 0.1)

50. COALESCE (handle NULL values)

SQL: SELECT COALESCE(salary, 0) FROM employees;
PySpark: df.select(coalesce(df.salary, lit(0)))




Loading

Leave a Reply

Discover more from Power BI Docs

Subscribe now to keep reading and get access to the full archive.

Continue reading