Lecture 18 - SQLite and Python
SELECT
, WHERE
and ORDER BY
GROUP BY
and filter groups with HAVING
COUNT
, SUM
, AVG
, MIN
, and MAX
sqlite3
library together with pandas
LIKE
, IN
, BETWEEN
, CASE
, window functions, and string functions like SUBSTR
and LENGTH
COALESCE
fetchall
, fetchone
)CASE
CREATE TABLE
.cursor.execute()
to run the command and conn.commit()
to save the change.# Use triple quotes for multi-line SQL
# Drop table if exist
sql_drop = '''
DROP TABLE IF EXISTS drivers;
'''
# Execute the SQL command
cur.execute(sql_drop)
# Create a new table named 'drivers'
sql_create = '''
CREATE TABLE drivers (
driver_id INTEGER PRIMARY KEY AUTOINCREMENT,
driver_name TEXT,
team TEXT,
nationality TEXT,
victories INTEGER
);
'''
# Execute the SQL command
cur.execute(sql_create)
# Commit the transaction to save the table
conn.commit()
# Check if the table was created
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='drivers';")
<sqlite3.Cursor at 0x103d31bc0>
INSERT INTO
.cursor.execute()
runs the command, and conn.commit()
makes the insertions permanent.# Insert a few rows
cur.execute("INSERT INTO drivers (driver_name, team, nationality, victories) VALUES ('Lewis Hamilton', 'Mercedes', 'British', 103);")
cur.execute("INSERT INTO drivers (driver_name, team, nationality, victories) VALUES ('Max Verstappen', 'Red Bull Racing', 'Dutch', 55);")
cur.execute("INSERT INTO drivers (driver_name, team, nationality, victories) VALUES ('Fernando Alonso', 'Aston Martin', NULL, NULL);")
cur.execute("INSERT INTO drivers (driver_name, team, nationality) VALUES ('Charles Leclerc', 'Ferrari', 'Monégasque');")
# Commit the insertions
conn.commit()
fetchall
) and One Result (fetchone
)SELECT
query, fetchall()
retrieves all results at once into a list of tuples.cur.execute('SELECT driver_id, driver_name, team FROM drivers')
all_rows = cur.fetchall() # Get the list of tuples
# Print the fetched data
for row in all_rows:
print(row)
(1, 'Lewis Hamilton', 'Mercedes')
(2, 'Max Verstappen', 'Red Bull Racing')
(3, 'Fernando Alonso', 'Aston Martin')
(4, 'Charles Leclerc', 'Ferrari')
fetchone()
retrieves rows one by one. Each call gets the next available row.None
when no more rows are left. Good for processing sequentially or just getting the top item.IN
and BETWEEN
OperatorsIN
to check if a column’s value matches any value in a specified list.query_in = "SELECT driver_name, team FROM drivers WHERE team IN ('Ferrari', 'Mercedes')"
cur.execute(query_in)
for row in cur.fetchall():
print(row)
('Lewis Hamilton', 'Mercedes')
('Charles Leclerc', 'Ferrari')
NOT IN
works similarly to exclude values in the list.query_not_in = "SELECT driver_name, team FROM drivers WHERE team NOT IN ('Ferrari', 'Mercedes')"
cur.execute(query_not_in)
for row in cur.fetchall():
print(row)
('Max Verstappen', 'Red Bull Racing')
('Fernando Alonso', 'Aston Martin')
BETWEEN
checks if a value is within a specified range. Inclusive of the endpoints.query_between = "SELECT driver_name, victories FROM drivers WHERE victories BETWEEN 10 AND 60"
cur.execute(query_between)
for row in cur.fetchall():
print(row)
('Max Verstappen', 55)
NOT BETWEEN
excludes values within the specified range.LIKE
Operator: Basic PatternsLIKE
enables simple pattern matching in text strings.
%
: Matches any sequence (including none) of characters._
: Matches exactly one character.# Starts with 'L'
cur.execute("SELECT driver_name FROM drivers WHERE driver_name LIKE 'L%'")
for row in cur.fetchall(): print(row)
('Lewis Hamilton',)
LIKE
Operator: Case Sensitivity and NOT LIKE
LIKE
is case-insensitive for ASCII by default. Use LOWER()
/UPPER()
for reliable, explicit case handling.# Case-insensitive search for names starting with 'l'
cur.execute("SELECT driver_name FROM drivers WHERE LOWER(driver_name) LIKE 'l%'")
for row in cur.fetchall(): print(row)
('Lewis Hamilton',)
COLLATE NOCASE
to make a specific comparison case-insensitive. This is a more general (and recommended) approach.# Case-insensitive search for names starting with 'l' using COLLATE
cur.execute("SELECT driver_name FROM drivers WHERE driver_name LIKE 'l%' COLLATE NOCASE")
for row in cur.fetchall(): print(row)
('Lewis Hamilton',)
NOT LIKE
to find strings that do not match the pattern.cur.execute()
and fetching/looping.LENGTH()
function might be useful).BETWEEN
).IS NULL
) and COALESCE
NULL
. Use IS NULL
to find them. IS NOT NULL
finds rows with values.cur.execute("SELECT driver_name, victories FROM drivers WHERE victories IS NULL")
for row in cur.fetchall(): print(row)
('Fernando Alonso', None)
('Charles Leclerc', None)
COALESCE(value1, value2, ...)
is useful for replacing NULL
s with a default value. It returns the first non-NULL expression in its arguments.COALESCE
with SubqueriesCOALESCE
can be dynamic, calculated by a subquery.query_coalesce_sub = '''
SELECT
driver_name,
COALESCE(victories,
-- Subquery calculates average victories from non-NULL rows
(SELECT CAST(AVG(victories) AS INTEGER)
FROM drivers
WHERE victories IS NOT NULL)
) AS victories_imputed
FROM drivers;
'''
cur.execute(query_coalesce_sub)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 103)
('Max Verstappen', 55)
('Fernando Alonso', 79)
('Charles Leclerc', 79)
NULL
victories are replaced by the average of non-NULL victories.GROUP BY
.OVER()
clause.# Add more data for better examples
more_drivers_data = [
('Valtteri Bottas', 'Mercedes', 'Finnish', 10),
('Sergio Perez', 'Red Bull Racing', 'Mexican', 5),
('Lando Norris', 'McLaren', 'British', 2),
('Esteban Ocon', 'Alpine', 'French', 1)
]
# Check if data already exists to avoid duplicates
cur.execute("SELECT COUNT(*) FROM drivers WHERE driver_name = 'Valtteri Bottas'")
if cur.fetchone()[0] == 0:
cur.executemany('INSERT INTO drivers (driver_name, team, nationality, victories) VALUES (?, ?, ?, ?)', more_drivers_data)
conn.commit()
print(f"Added {len(more_drivers_data)} more drivers.")
else:
print("Additional drivers already exist.")
# Check SQLite Version
cur.execute("SELECT sqlite_version();")
print(f"SQLite Version: {cur.fetchone()[0]}")
Added 4 more drivers.
SQLite Version: 3.46.0
AVG
and RANK
AVG(...) OVER ()
: Average over the entire query result.AVG(...) OVER (PARTITION BY col)
: Average within groups (partitions) defined by col
.RANK() OVER (ORDER BY col)
: Assigns rank based on col
(gaps possible for ties).query_window_avg_rank = '''
SELECT
driver_name, team, victories,
ROUND(AVG(victories) OVER (), 2) AS avg_overall,
ROUND(AVG(victories) OVER (PARTITION BY team), 2) AS avg_team,
RANK() OVER (ORDER BY victories DESC) AS rank_overall
FROM drivers
WHERE victories IS NOT NULL
ORDER BY rank_overall;
'''
cur.execute(query_window_avg_rank)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 'Mercedes', 103, 29.33, 56.5, 1)
('Max Verstappen', 'Red Bull Racing', 55, 29.33, 30.0, 2)
('Valtteri Bottas', 'Mercedes', 10, 29.33, 56.5, 3)
('Sergio Perez', 'Red Bull Racing', 5, 29.33, 30.0, 4)
('Lando Norris', 'McLaren', 2, 29.33, 2.0, 5)
('Esteban Ocon', 'Alpine', 1, 29.33, 1.0, 6)
GROUP BY
GROUP BY
reduces the number of rows to one per group. Window functions maintain all original rows and add new columns based on the window calculation.# GROUP BY example (summarizes)
cur.execute("SELECT team, ROUND(AVG(victories), 2) FROM drivers WHERE victories IS NOT NULL GROUP BY team")
print("--- GROUP BY Output (Collapsed) ---")
for row in cur.fetchall(): print(row)
# Window Function example (adds detail to each row)
cur.execute('''
SELECT driver_name, team, victories,
ROUND(AVG(victories) OVER (PARTITION BY team), 2) as avg_in_team
FROM drivers
WHERE victories IS NOT NULL ORDER BY team, victories DESC
''')
print("\n--- Window Function Output (All Rows) ---")
for row in cur.fetchall(): print(row)
--- GROUP BY Output (Collapsed) ---
('Alpine', 1.0)
('McLaren', 2.0)
('Mercedes', 56.5)
('Red Bull Racing', 30.0)
--- Window Function Output (All Rows) ---
('Esteban Ocon', 'Alpine', 1, 1.0)
('Lando Norris', 'McLaren', 2, 2.0)
('Lewis Hamilton', 'Mercedes', 103, 56.5)
('Valtteri Bottas', 'Mercedes', 10, 56.5)
('Max Verstappen', 'Red Bull Racing', 55, 30.0)
('Sergio Perez', 'Red Bull Racing', 5, 30.0)
driver_name
, nationality
, victories
.rank_nationality
: rank drivers by victories
within each nationality
. (PARTITION BY
needed).NULL
victories.nationality
, then rank_nationality
.cur.execute()
and fetching/looping.LENGTH(str)
, UPPER(str)
, LOWER(str)
.SUBSTR(str, start, length)
extracts a portion of the string (start is 1-indexed).query_str1 = '''
SELECT driver_name,
LENGTH(driver_name) AS len,
UPPER(driver_name) AS upper,
LOWER(driver_name) AS lower,
SUBSTR(driver_name, 1, 4) AS first_four -- Get first 4 characters
FROM drivers LIMIT 4;
'''
cur.execute(query_str1)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 14, 'LEWIS HAMILTON', 'lewis hamilton', 'Lewi')
('Max Verstappen', 14, 'MAX VERSTAPPEN', 'max verstappen', 'Max ')
('Fernando Alonso', 15, 'FERNANDO ALONSO', 'fernando alonso', 'Fern')
('Charles Leclerc', 15, 'CHARLES LECLERC', 'charles leclerc', 'Char')
TRIM(str)
removes leading/trailing whitespace.REPLACE(str, find, replace)
substitutes text.INSTR(str, find)
finds the starting position of find
within str
(0 if not found).||
operator concatenates (joins) strings.query_str2 = '''
SELECT driver_name,
REPLACE(driver_name, ' ', '_') AS replaced_space,
INSTR(LOWER(driver_name), 'a') AS first_a_pos,
'Driver: ' || driver_name AS labelled_name -- Concatenation
FROM drivers LIMIT 4;
'''
cur.execute(query_str2)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 'Lewis_Hamilton', 8, 'Driver: Lewis Hamilton')
('Max Verstappen', 'Max_Verstappen', 2, 'Driver: Max Verstappen')
('Fernando Alonso', 'Fernando_Alonso', 5, 'Driver: Fernando Alonso')
('Charles Leclerc', 'Charles_Leclerc', 3, 'Driver: Charles Leclerc')
CASE
StatementCASE
statement is SQL’s way of implementing if-then-else logic within a query.CASE WHEN condition THEN result ELSE default_result END
query_case1 = '''
SELECT driver_name, victories,
CASE
WHEN victories > 50 THEN 'Legend'
ELSE 'Great Driver (or N/A)'
END AS category
FROM drivers;
'''
cur.execute(query_case1)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 103, 'Legend')
('Max Verstappen', 55, 'Legend')
('Fernando Alonso', None, 'Great Driver (or N/A)')
('Charles Leclerc', None, 'Great Driver (or N/A)')
('Valtteri Bottas', 10, 'Great Driver (or N/A)')
('Sergio Perez', 5, 'Great Driver (or N/A)')
('Lando Norris', 2, 'Great Driver (or N/A)')
('Esteban Ocon', 1, 'Great Driver (or N/A)')
CASE
with Multiple ConditionsWHEN
clauses. The first condition that evaluates to true determines the result for that row. The optional ELSE
handles cases where no WHEN
is true.query_case2 = '''
SELECT driver_name, victories,
CASE
WHEN victories > 100 THEN 'All-Time Great'
WHEN victories > 50 THEN 'Legend'
WHEN victories >= 10 THEN 'Race Winner'
WHEN victories > 0 THEN 'Podium Potential'
ELSE 'Data Missing or Zero Wins'
END AS status
FROM drivers ORDER BY victories DESC NULLS LAST;
'''
cur.execute(query_case2)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 103, 'All-Time Great')
('Max Verstappen', 55, 'Legend')
('Valtteri Bottas', 10, 'Race Winner')
('Sergio Perez', 5, 'Podium Potential')
('Lando Norris', 2, 'Podium Potential')
('Esteban Ocon', 1, 'Podium Potential')
('Fernando Alonso', None, 'Data Missing or Zero Wins')
('Charles Leclerc', None, 'Data Missing or Zero Wins')
CASE
for Conditional NULL HandlingCASE
offers more nuanced control over handling NULL
s compared to just COALESCE
, allowing checks on other columns.query_case_null = '''
SELECT driver_name,
-- Fill nationality based on name if NULL
CASE
WHEN nationality IS NULL AND driver_name = 'Fernando Alonso' THEN 'Spanish'
WHEN nationality IS NULL THEN 'Unknown'
ELSE nationality
END AS nationality_filled,
-- Fill victories based on team if NULL
CASE
WHEN victories IS NULL AND team = 'Aston Martin' THEN 32 -- Educated guess!
WHEN victories IS NULL THEN 0
ELSE victories
END AS victories_filled
FROM drivers WHERE driver_name LIKE 'F%' OR driver_name LIKE 'L%'; -- Limit output
'''
cur.execute(query_case_null)
for row in cur.fetchall(): print(row)
('Lewis Hamilton', 'British', 103)
('Fernando Alonso', 'Spanish', 32)
('Lando Norris', 'British', 2)
CASE
to create driver_level
:
driver_name
, victories
, driver_level
.cur.execute()
and fetching/looping.pandas
🐼pandas.read_sql
fetchall()
works, but pandas
makes it much easier for analysis.pandas.read_sql()
executes a SELECT
query and loads the results directly into a DataFrame.import pandas as pd # Import pandas
# Pass the SQL query and the connection object
df = pd.read_sql('SELECT * FROM drivers', conn)
print(df)
driver_id driver_name team nationality victories
0 1 Lewis Hamilton Mercedes British 103.0
1 2 Max Verstappen Red Bull Racing Dutch 55.0
2 3 Fernando Alonso Aston Martin None NaN
3 4 Charles Leclerc Ferrari Monégasque NaN
4 5 Valtteri Bottas Mercedes Finnish 10.0
5 6 Sergio Perez Red Bull Racing Mexican 5.0
6 7 Lando Norris McLaren British 2.0
7 8 Esteban Ocon Alpine French 1.0
read_sql
with Any SELECT
QuerySELECT
statements, including joins, filtering, ordering, etc., within read_sql
.query_pd = """
SELECT driver_name, team, victories
FROM drivers
WHERE victories > 5 OR victories IS NULL
ORDER BY team
"""
df_filtered_ordered = pd.read_sql(query_pd, conn)
print(df_filtered_ordered)
driver_name team victories
0 Fernando Alonso Aston Martin NaN
1 Charles Leclerc Ferrari NaN
2 Lewis Hamilton Mercedes 103.0
3 Valtteri Bottas Mercedes 10.0
4 Max Verstappen Red Bull Racing 55.0
pandas
handles retrieving all the data specified by your SQL query.pandas
DataFrame. Apply all your data manipulation skills!to_sql
)DataFrame.to_sql()
writes the DataFrame’s contents into a database table.# Example: Create a DataFrame with British drivers
df_british = df[df['nationality'] == 'British'].copy()
# Write to a new table named 'british_drivers'
# index=False: [Important!]{.alert} Prevents pandas index from becoming a DB column.
df_british.to_sql('british_drivers', conn, if_exists='replace', index=False)
# Verify by reading it back using pandas
print(pd.read_sql('SELECT * FROM british_drivers', conn))
driver_id driver_name team nationality victories
0 1 Lewis Hamilton Mercedes British 103.0
1 7 Lando Norris McLaren British 2.0
employees
table (id
, name
, dept
, salary
) using cursor
.cursor
. Commit.employees
table into df_employees
using pd.read_sql()
. Print it.df_employees
, calculate and print the average salary per department.PIVOT
command, so we use conditional aggregation.student_scores
data first.# Create and populate student_scores table
cur.execute('DROP TABLE IF EXISTS student_scores;')
cur.execute('CREATE TABLE student_scores (student_name TEXT, subject TEXT, score INTEGER, term TEXT);')
scores_data = [
('Alice', 'Maths', 90, 'Q1'), ('Alice', 'Maths', 80, 'Q2'), ('Alice', 'Science', 80, 'Q1'), ('Alice', 'Science', 75, 'Q2'),
('Bob', 'Maths', 80, 'Q1'), ('Bob', 'Maths', 100, 'Q2'), ('Bob', 'Science', 80, 'Q1'), ('Bob', 'Science', 70, 'Q2'),
('Charles', 'Maths', 70, 'Q1'), ('Charles', 'Maths', 75, 'Q2'), ('Charles', 'Science', 90, 'Q1'), ('Charles', 'Science', 85, 'Q2')
]
cur.executemany('INSERT INTO student_scores VALUES (?, ?, ?, ?)', scores_data)
conn.commit()
print("Created and populated student_scores.")
# Display raw data using pandas
print(pd.read_sql('SELECT * FROM student_scores ORDER BY student_name, term, subject', conn))
Created and populated student_scores.
student_name subject score term
0 Alice Maths 90 Q1
1 Alice Science 80 Q1
2 Alice Maths 80 Q2
3 Alice Science 75 Q2
4 Bob Maths 80 Q1
5 Bob Science 80 Q1
6 Bob Maths 100 Q2
7 Bob Science 70 Q2
8 Charles Maths 70 Q1
9 Charles Science 90 Q1
10 Charles Maths 75 Q2
11 Charles Science 85 Q2
CASE
and GROUP BY
AVG
, SUM
, etc.) around a CASE
statement for each desired new column. GROUP BY
ensures one row per student.pivot_query = '''
SELECT
student_name,
-- Avg Maths: Only average score WHEN subject is Maths
AVG(CASE WHEN subject = 'Maths' THEN score END) as Avg_Maths,
-- Avg Science: Only average score WHEN subject is Science
AVG(CASE WHEN subject = 'Science' THEN score END) as Avg_Science
FROM student_scores
GROUP BY student_name
ORDER BY student_name;
'''
# Display the pivoted result using pandas
df_pivot = pd.read_sql(pivot_query, conn)
print(df_pivot)
student_name Avg_Maths Avg_Science
0 Alice 85.0 77.5
1 Bob 90.0 75.0
2 Charles 72.5 87.5
AVG
ignores NULLs, which are produced by the CASE
statement when the subject doesn’t match.sqlite3
provides the basic tools to interact with SQLite from Python.connect()
, cursor()
, execute()
, commit()
.fetchone()
, fetchall()
, or by iterating the cursor.CASE
+ GROUP BY
for pivoting.pandas.read_sql()
greatly simplifies loading data for analysis.df.to_sql()
saves DataFrames back to the database.Source: Susan Ibach (2020)
SQLite connection closed.
Exercise Solutions (Using standard sqlite3
fetching)
import sqlite3
db_file = 'lecture18.db'
conn = sqlite3.connect(db_file)
cur = conn.cursor()
# Find M names
cur.execute("SELECT * FROM drivers WHERE driver_name LIKE 'M%';")
print("--- Drivers starting with M ---")
for row in cur.fetchall(): print(row)
# Find 7 char nationalities
cur.execute("SELECT * FROM drivers WHERE LENGTH(nationality) = 7;")
print("\n--- Nationalities with 7 chars ---")
for row in cur.fetchall(): print(row)
# Find L or M names
cur.execute("SELECT * FROM drivers WHERE driver_name LIKE 'L%' OR driver_name LIKE 'M%';")
print("\n--- Drivers starting with L or M ---")
for row in cur.fetchall(): print(row)
# Find 1-10 wins
cur.execute("SELECT * FROM drivers WHERE victories BETWEEN 1 AND 10;")
print("\n--- Drivers with 1-10 wins ---")
for row in cur.fetchall(): print(row)
conn.close()
--- Drivers starting with M ---
(2, 'Max Verstappen', 'Red Bull Racing', 'Dutch', 55)
--- Nationalities with 7 chars ---
(1, 'Lewis Hamilton', 'Mercedes', 'British', 103)
(5, 'Valtteri Bottas', 'Mercedes', 'Finnish', 10)
(6, 'Sergio Perez', 'Red Bull Racing', 'Mexican', 5)
(7, 'Lando Norris', 'McLaren', 'British', 2)
--- Drivers starting with L or M ---
(1, 'Lewis Hamilton', 'Mercedes', 'British', 103)
(2, 'Max Verstappen', 'Red Bull Racing', 'Dutch', 55)
(7, 'Lando Norris', 'McLaren', 'British', 2)
--- Drivers with 1-10 wins ---
(5, 'Valtteri Bottas', 'Mercedes', 'Finnish', 10)
(6, 'Sergio Perez', 'Red Bull Racing', 'Mexican', 5)
(7, 'Lando Norris', 'McLaren', 'British', 2)
(8, 'Esteban Ocon', 'Alpine', 'French', 1)
Exercise Solution (Using standard sqlite3
fetching)
import sqlite3
db_file = 'lecture18.db'
conn = sqlite3.connect(db_file)
cur = conn.cursor()
# Query and display using standard fetch
query_rank_nationality = '''
SELECT
driver_name, nationality, victories,
RANK() OVER (PARTITION BY nationality ORDER BY victories DESC) AS rank_nationality
FROM drivers
WHERE victories IS NOT NULL
ORDER BY nationality, rank_nationality;
'''
cur.execute(query_rank_nationality)
print("\n--- Drivers ranked by victories within nationality ---")
for row in cur.fetchall(): print(row)
conn.close()
--- Drivers ranked by victories within nationality ---
('Lewis Hamilton', 'British', 103, 1)
('Lando Norris', 'British', 2, 2)
('Max Verstappen', 'Dutch', 55, 1)
('Valtteri Bottas', 'Finnish', 10, 1)
('Esteban Ocon', 'French', 1, 1)
('Sergio Perez', 'Mexican', 5, 1)
Exercise Solution (Using standard sqlite3
fetching)
import sqlite3
db_file = 'lecture18.db'
conn = sqlite3.connect(db_file)
cur = conn.cursor()
query_driver_level = '''
SELECT
driver_name, victories,
CASE
WHEN victories IS NULL THEN 'Unknown'
WHEN victories > 50 THEN 'Expert'
WHEN victories BETWEEN 10 AND 50 THEN 'Intermediate'
WHEN victories < 10 THEN 'Beginner'
END AS driver_level
FROM drivers
ORDER BY victories DESC NULLS LAST;
'''
cur.execute(query_driver_level)
print("--- Driver classification using CASE ---")
for row in cur.fetchall(): print(row)
conn.close()
--- Driver classification using CASE ---
('Lewis Hamilton', 103, 'Expert')
('Max Verstappen', 55, 'Expert')
('Valtteri Bottas', 10, 'Intermediate')
('Sergio Perez', 5, 'Beginner')
('Lando Norris', 2, 'Beginner')
('Esteban Ocon', 1, 'Beginner')
('Fernando Alonso', None, 'Unknown')
('Charles Leclerc', None, 'Unknown')
Exercise Solution (Using pandas
as requested)
import pandas as pd
import sqlite3
import os
db_file = 'lecture18.db'
conn = sqlite3.connect(db_file)
cur = conn.cursor()
# 1. Create employees table
cur.execute('DROP TABLE IF EXISTS employees;')
cur.execute('''
CREATE TABLE employees (
employee_id INTEGER PRIMARY KEY AUTOINCREMENT,
employee_name TEXT, department TEXT, salary INTEGER
);''')
# 2. Insert data
employee_data = [
('Alice', 'HR', 50000), ('Bob', 'IT', 60000),
('Charlie', 'HR', 70000), ('David', 'IT', 80000)
]
cur.executemany('INSERT INTO employees (employee_name, department, salary) VALUES (?, ?, ?)', employee_data)
conn.commit()
print("Employees table created and populated.")
# --- Pandas part starts here ---
# 3. Read data with pandas
df_employees = pd.read_sql('SELECT * FROM employees', conn)
print("\n--- Employees DataFrame ---")
print(df_employees)
# 4. Compute average salary by department using pandas
avg_salary_dept = df_employees.groupby('department')['salary'].mean()
print("\n--- Average salary by department (pandas) ---")
print(avg_salary_dept)
conn.close()
Employees table created and populated.
--- Employees DataFrame ---
employee_id employee_name department salary
0 1 Alice HR 50000
1 2 Bob IT 60000
2 3 Charlie HR 70000
3 4 David IT 80000
--- Average salary by department (pandas) ---
department
HR 60000.0
IT 70000.0
Name: salary, dtype: float64
.db
file (lecture18.db
) or use code:import sqlite3
db_file_to_clean = 'lecture18.db'
conn_clean = sqlite3.connect(db_file_to_clean)
cur_clean = conn_clean.cursor()
cur_clean.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
tables = cur_clean.fetchall()
print(f"Tables to drop: {tables}")
for table_name_tuple in tables:
cur_clean.execute(f"DROP TABLE IF EXISTS {table_name_tuple[0]}")
print(f"Dropped table: {table_name_tuple[0]}")
conn_clean.commit()
conn_clean.close()
print("Finished dropping tables.")