#!/usr/bin/env python3
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

# The single in-memory document that every later section appends to.
doc = Document()

# Title: a level-0 heading, centred on the page.
title = doc.add_heading(
    text='SQL Interview Questions: A Complete Guide',
    level=0,
)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER

# ============ PART 1: CONCEPT QUESTIONS ============


def _add_italic_paragraph(document, text):
    """Append a paragraph whose whole text is a single italic run.

    In python-docx, italic is a character (run) property. Assigning
    ``italic`` on the Paragraph object returned by ``add_paragraph`` does not
    format the text, so the run is created explicitly and styled here.

    Args:
        document: The python-docx document to append to.
        text: The paragraph text.

    Returns:
        The newly added paragraph.
    """
    paragraph = document.add_paragraph()
    paragraph.add_run(text).italic = True
    return paragraph


def _add_term_bullets(document, pairs):
    """Append one 'List Bullet' paragraph per (term, description) pair.

    Each bullet reads "<term>: <description>" with the term part in bold.

    Args:
        document: The python-docx document to append to.
        pairs: Iterable of (term, description) string tuples.
    """
    for term, desc in pairs:
        bullet = document.add_paragraph(style='List Bullet')
        bullet.add_run(term + ': ').bold = True
        bullet.add_run(desc)


def _add_labelled_note(document, label, text):
    """Append a paragraph made of a bold label followed by plain text.

    Args:
        document: The python-docx document to append to.
        label: Bold lead-in, including any trailing punctuation and space.
        text: Plain text that follows the label.

    Returns:
        The newly added paragraph.
    """
    note = document.add_paragraph()
    note.add_run(label).bold = True
    note.add_run(text)
    return note


doc.add_heading('Part 1: SQL Concept Questions', level=1)
doc.add_paragraph('The key: Be concise, give a real example, and mention a gotcha or edge case.')

# Q1 - Joins
doc.add_heading('Q1 - Explain the Various Types of Joins', level=2)
doc.add_paragraph('Framework: Define → Visual mental model → Use case')
_add_italic_paragraph(
    doc,
    '"Joins combine rows from two tables based on a related column."',
)
_add_term_bullets(doc, [
    ('INNER JOIN', 'Only matching rows from both tables'),
    ('LEFT JOIN', 'All rows from left table + matches from right (NULLs where no match)'),
    ('RIGHT JOIN', 'Opposite of LEFT'),
    ('FULL OUTER JOIN', 'All rows from both, NULLs where no match'),
    ('CROSS JOIN', 'Cartesian product - every row paired with every row'),
])
_add_labelled_note(
    doc,
    'Gotcha: ',
    'LEFT JOIN with a WHERE clause on the right table often behaves like an INNER JOIN unless you check for NULL.',
)

# Q2 - UNION vs UNION ALL
doc.add_heading('Q2 - What is the Difference Between UNION and UNION ALL?', level=2)
_add_italic_paragraph(doc, '"Both combine result sets vertically."')
_add_term_bullets(doc, [
    ('UNION', 'Removes duplicates (slower - requires sorting/hashing)'),
    ('UNION ALL', 'Keeps all rows (faster)'),
])
_add_labelled_note(
    doc,
    'Best practice: ',
    'Use UNION ALL when you know there are no duplicates or duplicates are acceptable.',
)

# Q3 - WHERE vs HAVING
doc.add_heading('Q3 - What is the Difference Between WHERE and HAVING?', level=2)
# One sentence built from several runs so the keywords can be bold and the
# words "before"/"after" italic.
p = doc.add_paragraph()
p.add_run('WHERE').bold = True
p.add_run(' filters rows ')
p.add_run('before').italic = True
p.add_run(' aggregation. ')
p.add_run('HAVING').bold = True
p.add_run(' filters ')
p.add_run('after').italic = True
p.add_run(' aggregation.')

doc.add_paragraph('Example: "Find categories with total sales > $1000" needs HAVING because you filter on SUM().')

# Q4 - DIST Key vs SORT Key
doc.add_heading('Q4 - What is the Difference Between DIST Key and SORT Key?', level=2)
_add_italic_paragraph(
    doc,
    '"Both optimize query performance in columnar databases like Redshift."',
)
_add_term_bullets(doc, [
    ('DIST Key', 'Controls how data is distributed across nodes (optimize for JOINs)'),
    ('SORT Key', 'Controls row ordering within each node (optimize for range filters and ORDER BY)'),
])

# Q5 - LAG vs LEAD
doc.add_heading('Q5 - What is the Difference Between LAG and LEAD Functions?', level=2)
_add_italic_paragraph(
    doc,
    '"Window functions that access other rows relative to current row."',
)
_add_term_bullets(doc, [
    ('LAG', 'Looks backward (previous row)'),
    ('LEAD', 'Looks forward (next row)'),
])

doc.add_paragraph('Common use: Month-over-month comparisons.')

# ============ PART 2: QUERY QUESTIONS FRAMEWORK ============
doc.add_heading('Part 2: SQL Query Questions - Framework', level=1)

doc.add_paragraph('Framework for any SQL interview question:')

# Each entry is (step name, what to do); rendered as "N. name - " in bold
# followed by the description in plain text.
steps = [
    ('Clarify', 'Ask about edge cases, date ranges, tie-breaking'),
    ('Break it down', 'Identify the steps (often maps to CTEs)'),
    ('Write incrementally', 'Build and verify each piece'),
    ('Optimize', 'Mention indexes, explain choices'),
]
for number, (name, detail) in enumerate(steps, start=1):
    step_paragraph = doc.add_paragraph()
    label_run = step_paragraph.add_run(f'{number}. {name} - ')
    label_run.bold = True
    step_paragraph.add_run(detail)

# ============ EXAMPLE 1 ============
doc.add_heading('Example 1: Transactions & Products', level=1)
doc.add_paragraph('Given two tables:')
# The 'List Bullet' style draws the bullet glyph itself, so the text carries
# no literal bullet character (a '• ' prefix would render two bullets).
doc.add_paragraph('Transactions: transaction_id, customer_id, date, product_id, quantity', style='List Bullet')
doc.add_paragraph('Product: product_id, product_category, unit_price', style='List Bullet')

# Q1
doc.add_heading('Q1: Top 3 Product Categories by Revenue Last Month', level=2)
code1 = '''SELECT
    p.product_category,
    SUM(t.quantity * p.unit_price) AS total_revenue
FROM Transactions t
JOIN Product p ON t.product_id = p.product_id
WHERE t.date >= DATE_FORMAT(CURDATE() - INTERVAL 1 MONTH, '%Y-%m-01')
  AND t.date < DATE_FORMAT(CURDATE(), '%Y-%m-01')
GROUP BY p.product_category
ORDER BY total_revenue DESC
LIMIT 3;'''

# SQL is rendered as a single monospaced 9pt run so its layout is preserved.
p = doc.add_paragraph()
run = p.add_run(code1)
run.font.name = 'Courier New'
run.font.size = Pt(9)

talk = doc.add_paragraph()
talk.add_run('Talk through it: ').bold = True
talk.add_run('"I join to get price, filter to last month, aggregate by category, sort descending, limit 3."')

# Q2
doc.add_heading('Q2: Customer Spend % by Category', level=2)
code2 = '''WITH customer_category_spend AS (
    SELECT
        t.customer_id,
        p.product_category,
        SUM(t.quantity * p.unit_price) AS category_spend
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    GROUP BY t.customer_id, p.product_category
),
customer_total AS (
    SELECT
        customer_id,
        SUM(category_spend) AS total_spend
    FROM customer_category_spend
    GROUP BY customer_id
)
SELECT
    ccs.customer_id,
    ccs.product_category,
    ROUND(100.0 * ccs.category_spend / ct.total_spend, 2) AS pct_of_total
FROM customer_category_spend ccs
JOIN customer_total ct ON ccs.customer_id = ct.customer_id
ORDER BY ccs.customer_id, pct_of_total DESC;'''

p = doc.add_paragraph()
run = p.add_run(code2)
run.font.name = 'Courier New'
run.font.size = Pt(9)

# Q3
doc.add_heading('Q3: Month-over-Month Growth by Customer', level=2)
code3 = '''WITH monthly_sales AS (
    SELECT
        t.customer_id,
        DATE_FORMAT(t.date, '%Y-%m-01') AS month,
        SUM(t.quantity * p.unit_price) AS revenue
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    GROUP BY t.customer_id, DATE_FORMAT(t.date, '%Y-%m-01')
)
SELECT
    customer_id,
    month,
    revenue,
    LAG(revenue) OVER (PARTITION BY customer_id ORDER BY month) AS prev_month,
    ROUND(100.0 * (revenue - LAG(revenue) OVER (
        PARTITION BY customer_id ORDER BY month))
        / NULLIF(LAG(revenue) OVER (
            PARTITION BY customer_id ORDER BY month), 0), 2) AS mom_growth_pct
FROM monthly_sales
ORDER BY customer_id, month;'''

p = doc.add_paragraph()
run = p.add_run(code3)
run.font.name = 'Courier New'
run.font.size = Pt(9)

doc.add_paragraph('Key points to mention:')
doc.add_paragraph('Use NULLIF to avoid divide-by-zero', style='List Bullet')
doc.add_paragraph('LAG with PARTITION BY for per-customer comparison', style='List Bullet')

# ============ EXAMPLE 2 ============
doc.add_heading('Example 2: Products, Transactions & Customers', level=1)
doc.add_paragraph('Given three tables:')
# The 'List Bullet' style draws the bullet glyph itself, so the text carries
# no literal bullet character (a '• ' prefix would render two bullets).
doc.add_paragraph('Product: product_category, product_id, price', style='List Bullet')
doc.add_paragraph('Transactions: customer_id, date, product_id, quantity', style='List Bullet')
doc.add_paragraph('Customer: customer_id, name, address, region, account_type', style='List Bullet')

doc.add_heading('Q1: Top 3 Categories by Quantity in 2019', level=2)
code4 = '''SELECT
    p.product_category,
    SUM(t.quantity) AS total_quantity
FROM Transactions t
JOIN Product p ON t.product_id = p.product_id
WHERE YEAR(t.date) = 2019
GROUP BY p.product_category
ORDER BY total_quantity DESC
LIMIT 3;'''

# SQL is rendered as a single monospaced 9pt run so its layout is preserved.
p = doc.add_paragraph()
run = p.add_run(code4)
run.font.name = 'Courier New'
run.font.size = Pt(9)

doc.add_heading('Q2: Top 5 Customers per Region in Top 3 Categories', level=2)
code5 = '''WITH top_categories AS (
    SELECT p.product_category
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    WHERE YEAR(t.date) = 2019
    GROUP BY p.product_category
    ORDER BY SUM(t.quantity) DESC
    LIMIT 3
),
customer_sales AS (
    SELECT
        c.customer_id,
        c.region,
        SUM(t.quantity) AS total_qty
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    JOIN Customer c ON t.customer_id = c.customer_id
    WHERE p.product_category IN (SELECT product_category FROM top_categories)
      AND c.region IN ('US', 'EMEA', 'APAC')
    GROUP BY c.customer_id, c.region
),
ranked AS (
    SELECT
        customer_id,
        region,
        total_qty,
        ROW_NUMBER() OVER (PARTITION BY region ORDER BY total_qty DESC) AS rn
    FROM customer_sales
)
SELECT customer_id, region, total_qty
FROM ranked
WHERE rn <= 5
ORDER BY region, rn;'''

p = doc.add_paragraph()
run = p.add_run(code5)
run.font.name = 'Courier New'
run.font.size = Pt(9)

# ============ EXAMPLE 3 ============
doc.add_heading('Example 3: Orders Analysis', level=1)
doc.add_paragraph('Given table:')
# The 'List Bullet' style draws the bullet glyph itself, so the text carries
# no literal bullet character (a '• ' prefix would render two bullets).
doc.add_paragraph('Orders: marketplace_id, order_id, customer_id, item, units, order_date', style='List Bullet')

doc.add_heading('Q1: Percentage of 2021 Orders in Q1', level=2)
code6 = '''SELECT
    ROUND(100.0 * SUM(CASE
        WHEN order_date BETWEEN '2021-01-01' AND '2021-03-31'
        THEN 1 ELSE 0 END) / COUNT(*), 2) AS q1_pct
FROM Orders
WHERE YEAR(order_date) = 2021;'''

# SQL is rendered as a single monospaced 9pt run so its layout is preserved.
p = doc.add_paragraph()
run = p.add_run(code6)
run.font.name = 'Courier New'
run.font.size = Pt(9)

doc.add_heading('Q2: Top 10 Items from First Purchases in 2021 (US)', level=2)
code7 = '''WITH first_orders AS (
    SELECT customer_id, MIN(order_date) AS first_date
    FROM Orders
    WHERE YEAR(order_date) = 2021
      AND marketplace_id = 'US'
    GROUP BY customer_id
)
SELECT o.item, COUNT(*) AS times_sold
FROM Orders o
JOIN first_orders fo
    ON o.customer_id = fo.customer_id
    AND o.order_date = fo.first_date
WHERE o.marketplace_id = 'US'
GROUP BY o.item
ORDER BY times_sold DESC
LIMIT 10;'''

p = doc.add_paragraph()
run = p.add_run(code7)
run.font.name = 'Courier New'
run.font.size = Pt(9)

# ============ OTHER EXAMPLES ============
# Each SQL snippet below is added as one run in its own paragraph and set in
# 9pt Courier New.
doc.add_heading('Other Common Questions', level=1)

doc.add_heading('Unique Customers & Average Sales (Nov 2021)', level=2)
code8 = '''SELECT
    COUNT(DISTINCT customer_id) AS unique_customers,
    AVG(sales_amount) AS avg_sales
FROM Orders
WHERE order_date BETWEEN '2021-11-01' AND '2021-11-30';'''

sql_run = doc.add_paragraph().add_run(code8)
sql_font = sql_run.font
sql_font.name = 'Courier New'
sql_font.size = Pt(9)

doc.add_heading('Average of Customer Averages', level=2)
code9 = '''WITH customer_avgs AS (
    SELECT customer_id, AVG(sales_amount) AS avg_sales
    FROM Orders
    WHERE order_date BETWEEN '2021-11-01' AND '2021-11-30'
    GROUP BY customer_id
)
SELECT AVG(avg_sales) AS avg_of_avgs
FROM customer_avgs;'''

sql_run = doc.add_paragraph().add_run(code9)
sql_font = sql_run.font
sql_font.name = 'Courier New'
sql_font.size = Pt(9)

doc.add_heading('Why Average ≠ Average of Averages?', level=2)
# One paragraph assembled from runs; the second element names the run
# attribute to switch on (None leaves the run unformatted).
insight = doc.add_paragraph()
insight_segments = [
    ('Statistical insight: ', 'bold'),
    ('The difference indicates ', None),
    ('unequal purchase behavior', 'bold'),
    (' across customers. Simple average weights all transactions equally. Average of averages weights all ', None),
    ('customers', 'italic'),
    (' equally.', None),
]
for segment_text, emphasis in insight_segments:
    segment_run = insight.add_run(segment_text)
    if emphasis is not None:
        setattr(segment_run, emphasis, True)

doc.add_paragraph('If simple avg ($50) < avg of avgs ($60), it means high-volume customers spend less per transaction than low-volume customers.')

doc.add_heading('Histogram: Customer Order Counts (Including Zero)', level=2)
code10 = '''WITH all_customers AS (
    SELECT DISTINCT customer_id FROM Customers
),
nov_orders AS (
    SELECT customer_id, COUNT(DISTINCT order_id) AS order_count
    FROM Orders
    WHERE order_date BETWEEN '2021-11-01' AND '2021-11-30'
    GROUP BY customer_id
)
SELECT
    COALESCE(no.order_count, 0) AS order_count,
    COUNT(*) AS customer_count
FROM all_customers ac
LEFT JOIN nov_orders no ON ac.customer_id = no.customer_id
GROUP BY COALESCE(no.order_count, 0)
ORDER BY order_count;'''

sql_run = doc.add_paragraph().add_run(code10)
sql_font = sql_run.font
sql_font.name = 'Courier New'
sql_font.size = Pt(9)

doc.add_heading('Week with Largest Customer Change vs Prior Week', level=2)
code11 = '''WITH weekly_customers AS (
    SELECT
        DATE_SUB(order_date,
            INTERVAL WEEKDAY(order_date) DAY) + INTERVAL 6 DAY AS week_end,
        COUNT(DISTINCT customer_id) AS customers
    FROM Orders
    WHERE order_date >= CURDATE() - INTERVAL 1 YEAR
    GROUP BY week_end
)
SELECT
    week_end,
    customers,
    LAG(customers) OVER (ORDER BY week_end) AS prev_week,
    ABS(customers - LAG(customers) OVER (ORDER BY week_end)) AS abs_change
FROM weekly_customers
ORDER BY abs_change DESC
LIMIT 1;'''

sql_run = doc.add_paragraph().add_run(code11)
sql_font = sql_run.font
sql_font.name = 'Courier New'
sql_font.size = Pt(9)

# ============ INTERVIEW TIPS ============
doc.add_heading('General Interview Tips', level=1)

# (DO, DON'T) pairs; each pair becomes one table row.
tips = [
    ('Think out loud', 'Write in silence'),
    ('Ask clarifying questions', 'Assume edge cases'),
    ('Use CTEs for readability', 'Write one massive query'),
    ('Mention tradeoffs', 'Only give one solution'),
    ('Test with edge cases mentally', 'Assume it works'),
]

# Create table: one header row plus one row per tip. The row count is derived
# from the list so that editing `tips` can neither index past the last row
# nor leave an empty trailing row.
table = doc.add_table(rows=len(tips) + 1, cols=2)
table.style = 'Table Grid'

# Header: assigning `.text` puts the label in a single run in the cell's
# first paragraph, which is then made bold.
for header_cell, label in zip(table.rows[0].cells, ('DO', "DON'T")):
    header_cell.text = label
    header_cell.paragraphs[0].runs[0].bold = True

# Body rows start at index 1, directly below the header.
for i, (do, dont) in enumerate(tips, 1):
    row_cells = table.rows[i].cells
    row_cells[0].text = do
    row_cells[1].text = dont

# Save: write the finished document to its fixed output location, then
# report completion on stdout.
output_path = '/home/rosy/public/SQL_Interview_Guide.docx'
doc.save(output_path)
print("Document created successfully!")
