from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

# Create the empty document that every later statement in this script appends to.
doc = Document()

# Document title: a level-0 heading, centered on the page.
title = doc.add_heading(text='SQL Query Interview Examples', level=0)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER

# Helper function to add code blocks
def add_code(doc, code_text):
    """Append ``code_text`` to ``doc`` as a monospaced, indented paragraph.

    A new empty paragraph is added to ``doc``; the text goes into a single
    run set in 9 pt 'Courier New', and the paragraph is indented 0.25 inch
    from the left.

    Args:
        doc: Object exposing ``add_paragraph()`` (here, the python-docx
            document being built).
        code_text: The code to insert, passed to ``add_run`` unchanged.

    Returns:
        The paragraph that was added.
    """
    paragraph = doc.add_paragraph()

    code_run = paragraph.add_run(code_text)
    font = code_run.font
    font.name = 'Courier New'
    font.size = Pt(9)

    paragraph.paragraph_format.left_indent = Inches(0.25)
    return paragraph

def add_bold(doc, text):
    """Append a new paragraph to ``doc`` containing ``text`` as one bold run.

    Args:
        doc: Object exposing ``add_paragraph()`` (here, the python-docx
            document being built).
        text: The text to place in the bold run.

    Returns:
        The paragraph that was added.
    """
    paragraph = doc.add_paragraph()
    # Only the run is bolded; the paragraph itself is left as created.
    paragraph.add_run(text).bold = True
    return paragraph

# ============================================================
# EXAMPLE 1
# ============================================================
# Example 1: a two-table schema followed by three worked questions.
doc.add_heading('EXAMPLE 1', level=1)
doc.add_paragraph('Given two tables:')
for schema_line in (
    'Transactions - transaction_id, customer_id, date, product_id, quantity',
    'Product - product_id, product_category, unit_price',
):
    doc.add_paragraph(schema_line, style='List Bullet')

# --- Q1: clarifying question, approach bullets, then the query ---
doc.add_heading('Q1 - Top 3 product categories by revenue last month', level=2)
doc.add_paragraph('Clarify: What defines "last month"? Calendar month or rolling 30 days?')
add_bold(doc, 'Approach:')
for step_text in (
    'JOIN Transactions → Product on product_id',
    'Filter to last month',
    'Calculate revenue: quantity * unit_price',
    'GROUP BY category, SUM revenue, ORDER DESC, LIMIT 3',
):
    doc.add_paragraph(step_text, style='List Bullet')

# The date window runs from the first day of the previous calendar month
# (inclusive) to the first day of the current month (exclusive).
top_revenue_sql = '''SELECT
    p.product_category,
    SUM(t.quantity * p.unit_price) AS total_revenue
FROM Transactions t
JOIN Product p ON t.product_id = p.product_id
WHERE t.date >= DATE_FORMAT(CURDATE() - INTERVAL 1 MONTH, '%Y-%m-01')
  AND t.date < DATE_FORMAT(CURDATE(), '%Y-%m-01')
GROUP BY p.product_category
ORDER BY total_revenue DESC
LIMIT 3;'''
add_code(doc, top_revenue_sql)

# --- Q2: per-customer share of spend by category ---
doc.add_heading('Q2 - Customer spend % by category', level=2)
doc.add_paragraph('Clarify: Percentages should sum to 100% per customer, correct?')
add_bold(doc, 'Approach:')
for step_text in (
    'JOIN tables to get customer + category + spend',
    'Use window function to get customer total',
    'Calculate percentage',
):
    doc.add_paragraph(step_text, style='List Bullet')

# The CTE aggregates spend per (customer, category); the outer query divides
# by the per-customer total obtained with a window SUM.
spend_pct_sql = '''WITH category_spend AS (
    SELECT
        t.customer_id,
        p.product_category,
        SUM(t.quantity * p.unit_price) AS spend
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    GROUP BY t.customer_id, p.product_category
)
SELECT
    customer_id,
    product_category,
    spend,
    ROUND(100.0 * spend / SUM(spend) OVER (PARTITION BY customer_id), 2) AS pct_of_total
FROM category_spend
ORDER BY customer_id, pct_of_total DESC;'''
add_code(doc, spend_pct_sql)

# --- Q3: month-over-month revenue growth per customer ---
doc.add_heading('Q3 - Month-over-month growth by customer', level=2)
doc.add_paragraph('Clarify: Growth as percentage change? How to handle first month (no prior)?')
add_bold(doc, 'Approach:')
for step_text in (
    'Aggregate revenue by customer + month',
    'Use LAG() to get prior month',
    'Calculate (current - prior) / prior * 100',
):
    doc.add_paragraph(step_text, style='List Bullet')

# LAG() takes the previous row per customer ordered by month; NULLIF guards
# the division against a zero prior-month revenue.
mom_growth_sql = '''WITH monthly_revenue AS (
    SELECT
        t.customer_id,
        DATE_FORMAT(t.date, '%Y-%m-01') AS month,
        SUM(t.quantity * p.unit_price) AS revenue
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    GROUP BY t.customer_id, DATE_FORMAT(t.date, '%Y-%m-01')
)
SELECT
    customer_id,
    month,
    revenue,
    LAG(revenue) OVER (PARTITION BY customer_id ORDER BY month) AS prev_revenue,
    ROUND(
        100.0 * (revenue - LAG(revenue) OVER (PARTITION BY customer_id ORDER BY month)) /
        NULLIF(LAG(revenue) OVER (PARTITION BY customer_id ORDER BY month), 0),
        2
    ) AS mom_growth_pct
FROM monthly_revenue
ORDER BY customer_id, month;'''
add_code(doc, mom_growth_sql)

# ============================================================
# EXAMPLE 2
# ============================================================
# Example 2: a three-table schema followed by two worked questions.
doc.add_heading('EXAMPLE 2', level=1)
doc.add_paragraph('Given tables:')
for schema_line in (
    'Product - product_category, product_id, price',
    'Transactions - customer_id, date, product_id, quantity',
    'Customer - customer_id, name, address, region, account_type',
):
    doc.add_paragraph(schema_line, style='List Bullet')

# --- Q1: same shape as Example 1 Q1, aggregating quantity over calendar 2019 ---
doc.add_heading('Q1 - Top 3 categories by quantity sold in 2019', level=2)
doc.add_paragraph('Approach: Similar to Example 1 Q1, but aggregate quantity instead of revenue.')

top_quantity_sql = '''SELECT
    p.product_category,
    SUM(t.quantity) AS total_quantity
FROM Transactions t
JOIN Product p ON t.product_id = p.product_id
WHERE t.date >= '2019-01-01' AND t.date < '2020-01-01'
GROUP BY p.product_category
ORDER BY total_quantity DESC
LIMIT 3;'''
add_code(doc, top_quantity_sql)

# --- Q2: top customers per region, restricted to the Q1 categories ---
doc.add_heading('Q2 - Top 5 customers per region in top 3 categories', level=2)
doc.add_paragraph('Clarify: Top 5 per region or top 5 overall filtered by region?')
add_bold(doc, 'Approach:')
for step_text in (
    'Use Q1 result as subquery/CTE',
    'Filter transactions to those categories',
    'JOIN Customer for region',
    'RANK within each region, filter to top 5',
):
    doc.add_paragraph(step_text, style='List Bullet')

# NOTE(review): the last approach bullet says RANK, but the query uses
# ROW_NUMBER() — confirm which tie behaviour is intended.
# NOTE(review): customer_sales has no date filter although top_categories is
# limited to 2019, and the region list is hard-coded — confirm both are
# intentional.
top_customers_sql = '''WITH top_categories AS (
    SELECT p.product_category
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    WHERE t.date >= '2019-01-01' AND t.date < '2020-01-01'
    GROUP BY p.product_category
    ORDER BY SUM(t.quantity) DESC
    LIMIT 3
),
customer_sales AS (
    SELECT
        c.customer_id,
        c.region,
        SUM(t.quantity) AS total_qty
    FROM Transactions t
    JOIN Product p ON t.product_id = p.product_id
    JOIN Customer c ON t.customer_id = c.customer_id
    WHERE p.product_category IN (SELECT product_category FROM top_categories)
      AND c.region IN ('US', 'EMEA', 'APAC')
    GROUP BY c.customer_id, c.region
),
ranked AS (
    SELECT
        customer_id,
        region,
        total_qty,
        ROW_NUMBER() OVER (PARTITION BY region ORDER BY total_qty DESC) AS rn
    FROM customer_sales
)
SELECT customer_id, region, total_qty
FROM ranked
WHERE rn <= 5
ORDER BY region, rn;'''
add_code(doc, top_customers_sql)

# ============================================================
# EXAMPLE 3
# ============================================================
# Example 3: a single Orders table followed by two worked questions.
doc.add_heading('EXAMPLE 3', level=1)
doc.add_paragraph('Given table:')
doc.add_paragraph(
    'Orders - marketplace_id, order_id, customer_id, item, units, order_date',
    style='List Bullet',
)

# --- Q1: share of 2021 orders placed in the first quarter ---
doc.add_heading('Q1 - % of 2021 orders that were in Q1 2021', level=2)
doc.add_paragraph('Approach: Count Q1 orders, divide by total 2021 orders.')

# The WHERE clause limits rows to 2021; the CASE counts only Jan-Mar rows.
q1_share_sql = '''SELECT
    ROUND(
        100.0 * SUM(CASE WHEN order_date >= '2021-01-01' AND order_date < '2021-04-01' THEN 1 ELSE 0 END) /
        COUNT(*),
        2
    ) AS q1_pct_of_total
FROM Orders
WHERE order_date >= '2021-01-01' AND order_date < '2022-01-01';'''
add_code(doc, q1_share_sql)

# --- Q2: most frequent items among each customer's first 2021 US order date ---
doc.add_heading('Q2 - Top 10 items from first purchases in 2021 (US)', level=2)
doc.add_paragraph('Clarify: "First purchase" = customer\'s first order ever, or first in 2021?')
add_bold(doc, 'Approach:')
for step_text in (
    'Find each customer\'s first order date in 2021 (US)',
    'Get items from those orders',
    'Rank by frequency',
):
    doc.add_paragraph(step_text, style='List Bullet')

# The CTE finds MIN(order_date) per customer; the join back to Orders matches
# on customer and that date, so every US order on the first date is counted.
first_purchase_sql = '''WITH first_orders AS (
    SELECT customer_id, MIN(order_date) AS first_date
    FROM Orders
    WHERE order_date >= '2021-01-01' AND order_date < '2022-01-01'
      AND marketplace_id = 'US'
    GROUP BY customer_id
)
SELECT
    o.item,
    COUNT(*) AS times_sold
FROM Orders o
JOIN first_orders fo
    ON o.customer_id = fo.customer_id
    AND o.order_date = fo.first_date
WHERE o.marketplace_id = 'US'
GROUP BY o.item
ORDER BY times_sold DESC
LIMIT 10;'''
add_code(doc, first_purchase_sql)

# ============================================================
# OTHER EXAMPLES
# ============================================================
# Miscellaneous examples; this section adds no schema bullets of its own.
# NOTE(review): the queries below reference a `sales_amount` column and a
# `Customers` table that no schema bullet visible in this script declares —
# confirm the intended schema.
doc.add_heading('Other Examples', level=1)

# --- Simple average across all November 2021 orders ---
doc.add_heading('Unique customers & average sales (Nov 2021)', level=2)

simple_avg_sql = '''SELECT
    COUNT(DISTINCT customer_id) AS unique_customers,
    AVG(sales_amount) AS avg_sales
FROM Orders
WHERE order_date >= '2021-11-01' AND order_date < '2021-12-01';'''
add_code(doc, simple_avg_sql)

# --- Average of the per-customer averages for the same month ---
doc.add_heading('Average of customer averages (Nov 2021)', level=2)

avg_of_avgs_sql = '''WITH customer_avg AS (
    SELECT customer_id, AVG(sales_amount) AS avg_sales
    FROM Orders
    WHERE order_date >= '2021-11-01' AND order_date < '2021-12-01'
    GROUP BY customer_id
)
SELECT AVG(avg_sales) AS avg_of_avg_sales
FROM customer_avg;'''
add_code(doc, avg_of_avgs_sql)

# --- Conceptual explanation of why the two averages differ ---
doc.add_heading('Why are they different? (Conceptual)', level=2)
weighting_note = (
    'The simple average weights each transaction equally. '
    'The average of averages weights each customer equally.'
)
doc.add_paragraph(weighting_note)
# NOTE(review): the $50 / $60 figures are not introduced anywhere in the text
# this script generates — presumably simple average vs. average of averages;
# confirm.
skew_note = (
    'If $50 < $60, it means high-volume customers have lower average order values. '
    'A few customers placing many small orders pull down the simple average, '
    'but in the average-of-averages, they only count once.'
)
doc.add_paragraph(skew_note)
add_bold(doc, 'Key insight: This reveals customer behavior skew—some customers order frequently with smaller amounts.')

# --- Histogram of November order counts, including zero-order customers ---
doc.add_heading('Histogram of order counts (including zeros)', level=2)
doc.add_paragraph('Approach: Need all customers, LEFT JOIN to Nov orders, count orders, then group by count.')

# The LEFT JOIN plus COALESCE gives customers without November orders a
# count of 0.
histogram_sql = '''WITH nov_orders AS (
    SELECT customer_id, COUNT(*) AS order_count
    FROM Orders
    WHERE order_date >= '2021-11-01' AND order_date < '2021-12-01'
    GROUP BY customer_id
),
all_customers AS (
    SELECT
        c.customer_id,
        COALESCE(n.order_count, 0) AS order_count
    FROM Customers c
    LEFT JOIN nov_orders n ON c.customer_id = n.customer_id
)
SELECT
    order_count,
    COUNT(*) AS customer_count
FROM all_customers
GROUP BY order_count
ORDER BY order_count;'''
add_code(doc, histogram_sql)

# --- Week with the largest week-over-week change in distinct customers ---
doc.add_heading('Week with largest WoW customer change', level=2)
add_bold(doc, 'Approach:')
for step_text in (
    'Count distinct customers per week',
    'Use LAG() for prior week',
    'Calculate absolute change, find max',
):
    doc.add_paragraph(step_text, style='List Bullet')

# Orders are bucketed by a computed week-ending date; the final SELECT keeps
# the single row with the largest absolute change from the previous week.
wow_change_sql = '''WITH weekly_customers AS (
    SELECT
        DATE(order_date - INTERVAL WEEKDAY(order_date) DAY + INTERVAL 6 DAY) AS week_ending,
        COUNT(DISTINCT customer_id) AS customer_count
    FROM Orders
    WHERE order_date >= CURDATE() - INTERVAL 1 YEAR
    GROUP BY week_ending
),
with_change AS (
    SELECT
        week_ending,
        customer_count,
        LAG(customer_count) OVER (ORDER BY week_ending) AS prev_count,
        ABS(customer_count - LAG(customer_count) OVER (ORDER BY week_ending)) AS abs_change
    FROM weekly_customers
)
SELECT week_ending, customer_count, prev_count, abs_change
FROM with_change
ORDER BY abs_change DESC
LIMIT 1;'''
add_code(doc, wow_change_sql)

# ============================================================
# INTERVIEW FRAMEWORK
# ============================================================
# Summary section: a two-column grid table with one header row followed by
# one row per framework step.
doc.add_heading('Interview Framework Summary', level=1)

# (step name, what to do) pairs, in the order they appear in the table.
data = [
    ('Clarify', 'Restate the problem, ask about edge cases'),
    ('Identify', 'Tables, joins, filters needed'),
    ('Decompose', 'Break into CTEs or logical steps'),
    ('Write', 'Start simple, add complexity'),
    ('Validate', 'Talk through edge cases (NULLs, zeros, ties)'),
    ('Optimize', 'Mention indexes, alternatives if asked'),
]

# Size the table from the data (header + one row per entry) instead of a
# hard-coded count, so editing `data` cannot cause an IndexError in the loop
# below or leave empty trailing rows.
table = doc.add_table(rows=len(data) + 1, cols=2)
table.style = 'Table Grid'

# Header row
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Step'
hdr_cells[1].text = 'What to Do'

# Body rows start at index 1 because row 0 holds the header.
for i, (step, action) in enumerate(data, start=1):
    row_cells = table.rows[i].cells
    row_cells[0].text = step
    row_cells[1].text = action

# Save
# Write the finished document to its fixed output location, then report success.
output_path = '/home/rosy/public/sql_interview_guide.docx'
doc.save(output_path)
print("Document created successfully!")
