SQL vs PySpark for Data Engineers: 50 Hands-On Comparisons
1. Select all columns
SQL: SELECT * FROM employees;
PySpark: df = spark.table("employees")
2. Select specific columns
SQL: SELECT name, salary FROM employees;
PySpark: df.select("name", "salary")
3. Filter rows
SQL: SELECT * FROM employees WHERE salary > 50000; PySpark: df.filter(df.salary > 50000)
4. Multiple conditions (AND)
SQL: SELECT * FROM employees WHERE salary > 50000 AND dept = 'IT'; PySpark: df.filter((df.salary > 50000) & (df.dept == "IT"))
5. OR condition
SQL: SELECT * FROM employees WHERE dept = 'HR' OR dept = 'IT'; PySpark: df.filter((df.dept == "HR") | (df.dept == "IT"))
6. LIKE pattern
SQL: SELECT * FROM employees WHERE name LIKE 'A%';
PySpark: df.filter(df.name.like("A%"))
7. IN clause
SQL: SELECT * FROM employees WHERE dept IN ('HR','IT');
PySpark: df.filter(df.dept.isin("HR","IT"))
8. BETWEEN
SQL: SELECT * FROM employees WHERE salary BETWEEN 30000 AND 60000; PySpark: df.filter(df.salary.between(30000, 60000))
9. ORDER BY
SQL: SELECT * FROM employees ORDER BY salary DESC; PySpark: df.orderBy(df.salary.desc())
10. LIMIT
SQL: SELECT * FROM employees LIMIT 5; PySpark: df.limit(5)
11. DISTINCT
SQL: SELECT DISTINCT dept FROM employees;
PySpark: df.select("dept").distinct()
12. COUNT
SQL: SELECT COUNT(*) FROM employees; PySpark: df.count()
13. SUM
SQL: SELECT SUM(salary) FROM employees;
PySpark: df.agg({"salary":"sum"})
14. AVG
SQL: SELECT AVG(salary) FROM employees;
PySpark: df.agg({"salary":"avg"})
15. GROUP BY
SQL: SELECT dept, COUNT(*) FROM employees GROUP BY dept;
PySpark: df.groupBy("dept").count()
16. GROUP BY with SUM
SQL: SELECT dept, SUM(salary) FROM employees GROUP BY dept;
PySpark: df.groupBy("dept").sum("salary")
17. HAVING
SQL: SELECT dept, COUNT(*) FROM employees GROUP BY dept HAVING COUNT(*) > 5;
PySpark: df.groupBy("dept").count().filter("count > 5")
18. INNER JOIN
SQL: SELECT e.name, d.dept_name FROM employees e INNER JOIN dept d ON e.dept_id = d.id; PySpark: df.join(dept_df, df.dept_id == dept_df.id, "inner").select(df.name, dept_df.dept_name)
19. LEFT JOIN
SQL: SELECT * FROM employees e LEFT JOIN dept d ON e.dept_id = d.id; PySpark: df.join(dept_df, df.dept_id == dept_df.id, "left")
20. RIGHT JOIN
SQL: SELECT * FROM employees e RIGHT JOIN dept d ON e.dept_id = d.id; PySpark: df.join(dept_df, df.dept_id == dept_df.id, "right")
21. FULL JOIN
SQL: SELECT * FROM employees e FULL JOIN dept d ON e.dept_id = d.id; PySpark: df.join(dept_df, df.dept_id == dept_df.id, "full")
22. Alias column
SQL: SELECT name AS employee_name FROM employees;
PySpark: df.select(df.name.alias("employee_name"))
23. Column expression
SQL: SELECT salary * 2 AS double_salary FROM employees;
PySpark: df.select((df.salary * 2).alias("double_salary"))
24. CASE WHEN
SQL: SELECT name, CASE WHEN salary > 50000 THEN 'High' ELSE 'Low' END FROM employees;
PySpark: df.select("name", when(df.salary > 50000, "High").otherwise("Low"))
25. NULL check
SQL: SELECT * FROM employees WHERE salary IS NULL; PySpark: df.filter(df.salary.isNull())
26. NOT NULL
SQL: SELECT * FROM employees WHERE salary IS NOT NULL; PySpark: df.filter(df.salary.isNotNull())
27. Drop duplicates
SQL: SELECT DISTINCT * FROM employees; PySpark: df.dropDuplicates()
28. UNION
SQL: SELECT * FROM emp1 UNION SELECT * FROM emp2; PySpark: df1.union(df2).distinct()
29. UNION ALL
SQL: SELECT * FROM emp1 UNION ALL SELECT * FROM emp2; PySpark: df1.union(df2)
30. INTERSECT
SQL: SELECT * FROM emp1 INTERSECT SELECT * FROM emp2; PySpark: df1.intersect(df2)
31. EXCEPT
SQL: SELECT * FROM emp1 EXCEPT SELECT * FROM emp2; PySpark: df1.subtract(df2)
32. Create view
SQL: CREATE VIEW emp_view AS SELECT * FROM employees;
PySpark: df.createOrReplaceTempView("emp_view")
33. Subquery
SQL: SELECT * FROM employees WHERE salary > (SELECT AVG(salary) FROM employees);
PySpark: avg_sal = df.agg({"salary":"avg"}).collect()[0][0]; df.filter(df.salary > avg_sal)
34. ROW_NUMBER
SQL: SELECT name, ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("rn", row_number().over(Window.partitionBy("dept").orderBy(df.salary.desc())))
35. RANK
SQL: SELECT name, RANK() OVER (ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("rank", rank().over(Window.orderBy(df.salary.desc())))
36. DENSE_RANK
SQL: SELECT name, DENSE_RANK() OVER (ORDER BY salary DESC) FROM employees;
PySpark: df.withColumn("dense_rank", dense_rank().over(Window.orderBy(df.salary.desc())))
37. LEAD
SQL: SELECT name, LEAD(salary) OVER (ORDER BY salary) FROM employees;
PySpark: df.withColumn("lead_salary", lead("salary").over(Window.orderBy("salary")))
38. LAG
SQL: SELECT name, LAG(salary) OVER (ORDER BY salary) FROM employees;
PySpark: df.withColumn("lag_salary", lag("salary").over(Window.orderBy("salary")))
39. CURRENT DATE
SQL: SELECT CURRENT_DATE; PySpark: spark.range(1).select(current_date())
40. DATE ADD
SQL: SELECT DATE_ADD(CURRENT_DATE, 5); PySpark: spark.range(1).select(date_add(current_date(), 5))
41. CONCAT
SQL: SELECT CONCAT(name, dept) FROM employees;
PySpark: df.select(concat("name","dept"))
42. LENGTH
SQL: SELECT LENGTH(name) FROM employees;
PySpark: df.select(length("name"))
43. UPPER
SQL: SELECT UPPER(name) FROM employees;
PySpark: df.select(upper("name"))
44. LOWER
SQL: SELECT LOWER(name) FROM employees;
PySpark: df.select(lower("name"))
45. TRIM
SQL: SELECT TRIM(name) FROM employees;
PySpark: df.select(trim("name"))
46. CAST
SQL: SELECT CAST(salary AS STRING) FROM employees;
PySpark: df.select(df.salary.cast("string"))
47. DROP COLUMN
SQL: ALTER TABLE employees DROP COLUMN age;
PySpark: df.drop("age")
48. RENAME COLUMN
SQL: ALTER TABLE employees RENAME COLUMN name TO emp_name;
PySpark: df.withColumnRenamed("name", "emp_name")
49. ADD COLUMN
SQL: SELECT *, salary*0.1 AS bonus FROM employees;
PySpark: df.withColumn("bonus", df.salary * 0.1)
50. COALESCE (handle NULL values)
SQL: SELECT COALESCE(salary, 0) FROM employees; PySpark: df.select(coalesce(df.salary, lit(0)))
![]()