-- ============================================================================
-- SQL Tips, Tricks & Examples for the Practice Database
-- ============================================================================
-- This file contains useful SQL patterns, techniques, and examples
-- using the logistics/e-commerce practice database.
-- ============================================================================

-- ============================================================================
-- 1. DATE MANIPULATION
-- ============================================================================

-- First and last calendar day of the current month.
-- first_of_month is built as a formatted 'YYYY-MM-01' string; last_of_month comes from LAST_DAY().
SELECT
    DATE_FORMAT(NOW(), '%Y-%m-01') AS first_of_month,
    LAST_DAY(NOW())                AS last_of_month;

-- Break order_date into its calendar components (5 sample rows, in no particular order).
SELECT
    order_date,
    YEAR(order_date)    AS year,
    MONTH(order_date)   AS month,
    DAY(order_date)     AS day,
    DAYNAME(order_date) AS day_name,
    WEEK(order_date)    AS week_number,
    QUARTER(order_date) AS quarter
FROM orders
LIMIT 5;

-- Group by month, approach 1: a single 'YYYY-MM' string key.
SELECT
    DATE_FORMAT(order_date, '%Y-%m') AS month,
    COUNT(*) AS orders
FROM orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m');

-- Group by month, approach 2: separate numeric year and month keys.
SELECT
    YEAR(order_date) AS year,
    MONTH(order_date) AS month,
    COUNT(*) AS orders
FROM orders
GROUP BY
    YEAR(order_date),
    MONTH(order_date);

-- Date arithmetic: shift a date by an interval and measure the gap between two dates.
SELECT
    order_date,
    DATE_ADD(order_date, INTERVAL 7 DAY)   AS plus_week,
    DATE_SUB(order_date, INTERVAL 1 MONTH) AS minus_month,
    DATEDIFF(delivery_date, order_date)    AS days_to_deliver  -- delivery_date minus order_date, in days
FROM orders
WHERE delivery_date IS NOT NULL  -- rows without a delivery date would only yield NULL
LIMIT 5;

-- ============================================================================
-- 2. PERCENTAGE CALCULATIONS
-- ============================================================================

-- Share of customers per region; the grand total comes from an uncorrelated scalar subquery.
-- Multiplying by 100.0 (not 100) keeps the arithmetic in decimals.
SELECT
    region,
    COUNT(*) AS customer_count,
    ROUND(
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM customers),
        2
    ) AS pct_of_total
FROM customers
GROUP BY region;

-- Share of customers per region, window-function variant.
-- SUM(COUNT(*)) OVER () adds up the per-region counts after grouping, so the customers
-- table is referenced only once (no separate subquery for the grand total).
SELECT
    region,
    COUNT(*) AS customer_count,
    ROUND(
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (),
        2
    ) AS pct_of_total
FROM customers
GROUP BY region;

-- Month-over-month percentage change in shipments.
-- `monthly` collapses the daily metrics to one row per 'YYYY-MM' month.
WITH monthly AS (
    SELECT
        DATE_FORMAT(report_date, '%Y-%m') AS month,
        SUM(total_shipments) AS shipments
    FROM daily_delivery_metrics
    GROUP BY DATE_FORMAT(report_date, '%Y-%m')
)
SELECT
    month,
    shipments,
    LAG(shipments) OVER (ORDER BY month) AS prev_month,
    -- NULLIF guards the divisor explicitly (same pattern as the NULLIF examples in this file):
    -- pct_change is NULL when the previous month had zero shipments, and also for the
    -- first month, which has no previous row.
    ROUND((shipments - LAG(shipments) OVER (ORDER BY month))
          / NULLIF(LAG(shipments) OVER (ORDER BY month), 0) * 100, 2) AS pct_change
FROM monthly
ORDER BY month;

-- ============================================================================
-- 3. WINDOW FUNCTIONS
-- ============================================================================

-- Running total of order value across all orders.
-- With only ORDER BY in the window, the default frame is RANGE ... CURRENT ROW, which
-- treats all rows sharing an order_date as peers and gives each of them the same total.
-- The explicit ROWS frame plus the id tie-breaker accumulates one row at a time, and the
-- outer ORDER BY uses the same keys so the displayed rows line up with the totals.
SELECT
    order_date,
    total_amount,
    SUM(total_amount) OVER (
        ORDER BY order_date, id
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total
FROM orders
ORDER BY order_date, id
LIMIT 20;

-- Running total of order value, restarted for each customer.
-- The explicit ROWS frame and the id tie-breaker make the total advance row by row;
-- the default RANGE frame would give a customer's same-day orders an identical total.
SELECT
    customer_id,
    order_date,
    total_amount,
    SUM(total_amount) OVER (
        PARTITION BY customer_id
        ORDER BY order_date, id
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS customer_running_total
FROM orders
ORDER BY customer_id, order_date, id
LIMIT 20;

-- Rank products by revenue and show the top 20.
-- RANK leaves gaps after ties (1, 1, 3); DENSE_RANK does not (1, 1, 2).
-- The alias is backtick-quoted because DENSE_RANK is a reserved word in MySQL 8.0.2+.
-- Without the outer ORDER BY, LIMIT would return 20 arbitrary products rather than the
-- 20 best-ranked ones; p.id breaks ties so the output is repeatable.
SELECT
    p.name,
    p.category,
    SUM(oi.quantity * oi.unit_price) AS revenue,
    RANK() OVER (ORDER BY SUM(oi.quantity * oi.unit_price) DESC) AS revenue_rank,
    DENSE_RANK() OVER (ORDER BY SUM(oi.quantity * oi.unit_price) DESC) AS `dense_rank`
FROM products p
JOIN order_items oi ON p.id = oi.product_id
GROUP BY p.id, p.name, p.category
ORDER BY revenue_rank, p.id
LIMIT 20;

-- Rank products by revenue inside each category; PARTITION BY restarts the ranking per category.
SELECT
    p.name,
    p.category,
    SUM(oi.quantity * oi.unit_price) AS revenue,
    RANK() OVER (
        PARTITION BY p.category
        ORDER BY SUM(oi.quantity * oi.unit_price) DESC
    ) AS rank_in_category
FROM products AS p
INNER JOIN order_items AS oi
    ON oi.product_id = p.id
GROUP BY
    p.id,
    p.name,
    p.category
ORDER BY
    p.category,
    rank_in_category;

-- Row number for pagination simulation: number customers by id and show the first 20.
-- The outer ORDER BY matches the window ordering; without it LIMIT could return any
-- 20 rows, so the "page" would not be guaranteed to be rows 1-20.
SELECT
    ROW_NUMBER() OVER (ORDER BY id) AS row_num,
    id,
    name,
    region
FROM customers
ORDER BY id
LIMIT 20;

-- LAG / LEAD: put each row's shipments next to the previous and next row's value
-- (ordered by report_date) for warehouse 1.
-- prev_day is NULL on the first row and next_day is NULL on the last one.
SELECT
    report_date,
    warehouse_name,
    total_shipments,
    LAG(total_shipments)  OVER (PARTITION BY warehouse_id ORDER BY report_date) AS prev_day,
    LEAD(total_shipments) OVER (PARTITION BY warehouse_id ORDER BY report_date) AS next_day
FROM daily_delivery_metrics
WHERE warehouse_id = 1
ORDER BY report_date
LIMIT 20;

-- ============================================================================
-- 4. COMMON TABLE EXPRESSIONS (CTEs)
-- ============================================================================

-- Simple CTE: name the set of orders above 1000 once, then aggregate it.
-- Returns the 10 customers with the most such orders.
WITH high_value_orders AS (
    SELECT *
    FROM orders
    WHERE total_amount > 1000
)
SELECT
    customer_id,
    COUNT(*) AS high_value_count
FROM high_value_orders
GROUP BY customer_id
ORDER BY high_value_count DESC
LIMIT 10;

-- Chained CTEs: each step builds on the previous one.
WITH
-- Step 1: total spend per customer.
customer_totals AS (
    SELECT
        customer_id,
        SUM(total_amount) AS lifetime_value
    FROM orders
    GROUP BY customer_id
),
-- Step 2: bucket each customer by spend; the first matching WHEN wins, so the
-- thresholds are checked from highest to lowest.
customer_segments AS (
    SELECT
        customer_id,
        lifetime_value,
        CASE
            WHEN lifetime_value >= 10000 THEN 'VIP'
            WHEN lifetime_value >= 5000  THEN 'Gold'
            WHEN lifetime_value >= 1000  THEN 'Silver'
            ELSE 'Bronze'
        END AS tier
    FROM customer_totals
)
-- Step 3: size and average value of each tier.
SELECT
    tier,
    COUNT(*) AS customer_count,
    ROUND(AVG(lifetime_value), 2) AS avg_value
FROM customer_segments
GROUP BY tier
ORDER BY avg_value DESC;

-- Recursive CTE: generate one row per day from 2024-01-01 through 2024-01-31.
WITH RECURSIVE date_series AS (
    -- Anchor row: the first day of the series.
    SELECT DATE('2024-01-01') AS dt

    UNION ALL

    -- Recursive step: add a day until the last generated date reaches 2024-01-31.
    SELECT dt + INTERVAL 1 DAY
    FROM date_series
    WHERE dt < '2024-01-31'
)
SELECT dt
FROM date_series;

-- ============================================================================
-- 5. CONDITIONAL AGGREGATION
-- ============================================================================

-- Pivot-style report using CASE: one row per warehouse, one column per month.
-- `total` covers every 2024 row, not only January-March.
-- The half-open date range replaces YEAR(report_date) = 2024 so the column is not
-- wrapped in a function and an index on report_date can be used.
SELECT
    warehouse_name,
    SUM(CASE WHEN MONTH(report_date) = 1 THEN total_shipments ELSE 0 END) AS jan,
    SUM(CASE WHEN MONTH(report_date) = 2 THEN total_shipments ELSE 0 END) AS feb,
    SUM(CASE WHEN MONTH(report_date) = 3 THEN total_shipments ELSE 0 END) AS mar,
    SUM(total_shipments) AS total
FROM daily_delivery_metrics
WHERE report_date >= '2024-01-01'
  AND report_date <  '2025-01-01'
GROUP BY warehouse_id, warehouse_name;

-- Conditional counting per region: rows where on-time deliveries outnumber late ones,
-- and rows where late ones outnumber on-time. Rows with equal counts fall in neither bucket.
-- NOTE(review): total_shipments here is COUNT(*), i.e. the number of metric rows for the
-- region, not a sum of the total_shipments column used elsewhere in this file — confirm
-- the label is intended.
SELECT
    region,
    COUNT(*) AS total_shipments,
    SUM(CASE WHEN on_time_count > late_count THEN 1 ELSE 0 END) AS good_days,
    SUM(CASE WHEN on_time_count < late_count THEN 1 ELSE 0 END) AS bad_days
FROM daily_delivery_metrics
GROUP BY region;

-- Several aggregates per product category from a single query.
-- Because of the inner join, only products with at least one order item are counted.
-- NOTE(review): total_discounts presumably treats oi.discount as a fraction of the
-- line value (e.g. 0.10 for 10%) — confirm against the schema.
SELECT
    p.category,
    COUNT(DISTINCT p.id)                                     AS product_count,
    COUNT(DISTINCT oi.order_id)                              AS order_count,
    SUM(oi.quantity)                                         AS units_sold,
    ROUND(SUM(oi.quantity * oi.unit_price), 2)               AS gross_revenue,
    ROUND(SUM(oi.quantity * oi.unit_price * oi.discount), 2) AS total_discounts,
    ROUND(AVG(oi.unit_price), 2)                             AS avg_price
FROM products AS p
INNER JOIN order_items AS oi
    ON oi.product_id = p.id
GROUP BY p.category
ORDER BY gross_revenue DESC;

-- ============================================================================
-- 6. SUBQUERIES
-- ============================================================================

-- Scalar subquery in the SELECT list: for each warehouse, count the orders whose
-- shipment references that warehouse. The subquery is correlated on w.id, and
-- warehouses with no matching orders show 0.
SELECT
    w.name,
    w.region,
    (
        SELECT COUNT(*)
        FROM orders AS o
        INNER JOIN shipments AS s
            ON s.id = o.shipment_id
        WHERE s.warehouse_id = w.id
    ) AS total_orders
FROM warehouses AS w;

-- Correlated subquery: customers whose own average order value exceeds the average
-- across all orders. Customers with no orders have a NULL average and are excluded.
SELECT
    c.id,
    c.name,
    c.segment
FROM customers AS c
WHERE (
          SELECT AVG(o.total_amount)
          FROM orders AS o
          WHERE o.customer_id = c.id
      ) > (
          SELECT AVG(total_amount)
          FROM orders
      );

-- EXISTS - customers who placed at least one order in 2024.
-- The half-open date range replaces YEAR(o.order_date) = 2024 so the column is not
-- wrapped in a function and an index on order_date can be used.
SELECT c.id, c.name
FROM customers c
WHERE EXISTS (
    SELECT 1 FROM orders o
    WHERE o.customer_id = c.id
    AND o.order_date >= '2024-01-01'
    AND o.order_date <  '2025-01-01'
);

-- NOT EXISTS: products that do not appear in any order item.
SELECT
    p.id,
    p.sku,
    p.name
FROM products AS p
WHERE NOT EXISTS (
    SELECT 1
    FROM order_items AS oi
    WHERE oi.product_id = p.id
);

-- IN with a subquery: customers having at least one order above 2000.
-- IN only tests membership, so the subquery needs no DISTINCT.
SELECT *
FROM customers
WHERE id IN (
    SELECT customer_id
    FROM orders
    WHERE total_amount > 2000
);

-- ============================================================================
-- 7. JOINS - BEYOND THE BASICS
-- ============================================================================

-- Self join: pair up warehouses that share a region.
-- The w1.id < w2.id condition lists each pair once and never pairs a warehouse with itself.
SELECT
    w1.name   AS warehouse_1,
    w2.name   AS warehouse_2,
    w1.region
FROM warehouses AS w1
INNER JOIN warehouses AS w2
    ON  w2.region = w1.region
    AND w1.id < w2.id;

-- Anti-join via LEFT JOIN: keep products for which the join found no order item.
-- Unmatched products come back with NULLs in every oi column, which the WHERE picks out.
SELECT
    p.id,
    p.name
FROM products AS p
LEFT JOIN order_items AS oi
    ON oi.product_id = p.id
WHERE oi.id IS NULL;

-- Multi-table join: one row per order line with its customer, product, driver and
-- warehouse (10 sample rows). All joins are inner, so an order missing any of these
-- links does not appear.
SELECT
    o.id        AS order_id,
    c.name      AS customer,
    p.name      AS product,
    oi.quantity,
    d.name      AS driver,
    w.name      AS warehouse
FROM orders AS o
INNER JOIN customers   AS c  ON c.id = o.customer_id
INNER JOIN order_items AS oi ON oi.order_id = o.id
INNER JOIN products    AS p  ON p.id = oi.product_id
INNER JOIN shipments   AS s  ON s.id = o.shipment_id
INNER JOIN drivers     AS d  ON d.id = s.driver_id
INNER JOIN warehouses  AS w  ON w.id = s.warehouse_id
LIMIT 10;

-- ============================================================================
-- 8. STRING FUNCTIONS
-- ============================================================================

-- Common string functions applied to customer names (5 sample rows).
SELECT
    name,
    UPPER(name) AS upper_name,
    LOWER(name) AS lower_name,
    -- CHAR_LENGTH counts characters; LENGTH counts bytes and over-reports names that
    -- contain multi-byte characters (e.g. accented letters in utf8mb4).
    CHAR_LENGTH(name) AS name_length,
    LEFT(name, 10) AS first_10_chars,
    RIGHT(name, 5) AS last_5_chars,
    SUBSTRING(name, 1, 5) AS substr,
    REPLACE(name, ' ', '_') AS underscored,
    -- CONCAT returns NULL if any argument is NULL; CONCAT_WS skips NULL arguments.
    CONCAT(name, ' (', region, ')') AS name_with_region,
    CONCAT_WS(' - ', name, region, segment) AS joined
FROM customers
LIMIT 5;

-- Split delimited text with SUBSTRING_INDEX:
--   count  1 -> everything before the first '-'
--   count -1 -> everything after the last '-'
-- A sku without any '-' is returned unchanged in both columns.
SELECT
    sku,
    SUBSTRING_INDEX(sku, '-', 1)  AS prefix,
    SUBSTRING_INDEX(sku, '-', -1) AS suffix
FROM products
LIMIT 10;

-- ============================================================================
-- 9. NULL HANDLING
-- ============================================================================

-- COALESCE returns its first non-NULL argument, checked left to right.
SELECT
    id,
    actual_arrival,
    scheduled_arrival,
    COALESCE(actual_arrival, scheduled_arrival)        AS arrival_time,    -- actual if present, else scheduled
    COALESCE(actual_arrival, scheduled_arrival, NOW()) AS arrival_or_now   -- falls back to the current time
FROM shipments
LIMIT 10;

-- NULLIF(a, b) returns NULL when a = b, otherwise a; handy for guarding a divisor.
SELECT
    warehouse_name,
    total_shipments,
    late_count,
    -- NULLIF turns a zero total_shipments into NULL, so late_pct is NULL for that row.
    -- Without it, MySQL/MariaDB return NULL with a division-by-zero warning in a SELECT,
    -- while many other engines raise an error.
    ROUND(late_count * 100.0 / NULLIF(total_shipments, 0), 2) AS late_pct
FROM daily_delivery_metrics
LIMIT 10;

-- IFNULL(a, b): two-argument null replacement (the MySQL/MariaDB counterpart of Oracle's NVL).
-- Here a missing actual_arrival is shown as the text 'Not yet arrived'.
SELECT
    id,
    IFNULL(actual_arrival, 'Not yet arrived') AS arrival_status
FROM shipments
LIMIT 10;

-- Counting NULLs: COUNT(*) counts rows, COUNT(col) counts only non-NULL values,
-- so their difference is the number of NULLs. pending_alt gets the same number
-- with an explicit IS NULL test.
SELECT
    COUNT(*)                                                AS total_shipments,
    COUNT(actual_arrival)                                   AS completed,
    COUNT(*) - COUNT(actual_arrival)                        AS pending,
    SUM(CASE WHEN actual_arrival IS NULL THEN 1 ELSE 0 END) AS pending_alt
FROM shipments;

-- ============================================================================
-- 10. PERFORMANCE TIPS
-- ============================================================================

-- EXPLAIN shows the plan the optimizer chose for a query without running it.
EXPLAIN
SELECT *
FROM orders
WHERE customer_id = 100;

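-- Get actual execution stats by running the query:
--   MySQL 8.0.18+ uses EXPLAIN ANALYZE; MariaDB 10.1+ uses ANALYZE <statement> instead
-- EXPLAIN ANALYZE SELECT * FROM orders WHERE customer_id = 100;  -- MySQL
-- ANALYZE SELECT * FROM orders WHERE customer_id = 100;          -- MariaDB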

-- Check whether an index is used: look at the `key` column of the EXPLAIN output
-- (NULL there means no index was chosen for this lookup).
EXPLAIN
SELECT *
FROM orders
WHERE order_date = '2024-06-15';

-- Force index usage (if optimizer chooses wrong one)
-- SELECT * FROM orders FORCE INDEX (idx_order_date) WHERE order_date > '2024-01-01';

-- Covering index: the query touches only customer_id, so an index on that column can
-- answer it without reading the table rows at all.
-- NOTE(review): assumes orders has an index on customer_id — confirm in the schema.
EXPLAIN
SELECT customer_id
FROM orders
WHERE customer_id = 100;

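-- LIMIT early when possible, and pair it with an ORDER BY on an indexed column
-- Bad: SELECT * FROM big_table ORDER BY col LIMIT 10   (col not indexed: every row is read and sorted before LIMIT applies)
-- Good: Use indexed column for ORDER BY                (rows are read in index order and the scan stops after 10)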

-- ============================================================================
-- 11. USEFUL PATTERNS
-- ============================================================================

-- Top N per group: the three highest-revenue products in each category.
-- ROW_NUMBER gives every product a distinct position inside its category, so exactly
-- up to 3 rows per category survive the filter (ties are broken arbitrarily).
WITH ranked AS (
    SELECT
        p.category,
        p.name,
        SUM(oi.quantity * oi.unit_price) AS revenue,
        ROW_NUMBER() OVER (
            PARTITION BY p.category
            ORDER BY SUM(oi.quantity * oi.unit_price) DESC
        ) AS rn
    FROM products AS p
    INNER JOIN order_items AS oi
        ON oi.product_id = p.id
    GROUP BY
        p.id,
        p.category,
        p.name
)
SELECT
    category,
    name,
    revenue
FROM ranked
WHERE rn <= 3
ORDER BY
    category,
    rn;

-- Find gaps in the orders.id sequence (first 10 gaps, lowest first).
-- LEAD pairs each id with the next existing id in one ordered pass; wherever the next
-- id is more than 1 away, the ids in between are missing. This replaces a self-join
-- that matched every row against all higher ids. The last id has no successor
-- (next_id is NULL) and therefore never reports a gap.
WITH ordered AS (
    SELECT
        id,
        LEAD(id) OVER (ORDER BY id) AS next_id
    FROM orders
)
SELECT
    id + 1      AS gap_start,
    next_id - 1 AS gap_end
FROM ordered
WHERE next_id > id + 1
ORDER BY gap_start
LIMIT 10;

-- First step of deduplication: list every (name, region) combination that occurs
-- more than once, together with how many times it occurs.
SELECT
    name,
    region,
    COUNT(*) AS cnt
FROM customers
GROUP BY
    name,
    region
HAVING COUNT(*) > 1;

-- Running total that resets: per customer, restarting at each calendar year
-- (the year is part of the partition key).
-- The explicit ROWS frame and the id tie-breaker make the total advance row by row;
-- the default RANGE frame would give a customer's same-day orders an identical total.
SELECT
    customer_id,
    order_date,
    total_amount,
    SUM(total_amount) OVER (
        PARTITION BY customer_id, YEAR(order_date)
        ORDER BY order_date, id
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS yearly_running_total
FROM orders
ORDER BY customer_id, order_date, id
LIMIT 20;

-- ============================================================================
-- 12. DATA QUALITY CHECKS
-- ============================================================================

-- Required-data check: count orders marked 'delivered' that have no delivery_date.
SELECT
    'orders' AS tbl,
    COUNT(*) AS missing_delivery
FROM orders
WHERE status = 'delivered'
  AND delivery_date IS NULL;

-- Referential-integrity check: count order items whose order_id matches no row in orders.
-- The LEFT JOIN keeps every order item; the unmatched ones have o.id NULL.
SELECT
    'orphaned order_items' AS issue,
    COUNT(*)               AS cnt
FROM order_items AS oi
LEFT JOIN orders AS o
    ON o.id = oi.order_id
WHERE o.id IS NULL;

-- Distribution check: number of orders per status and each status's share of all orders.
-- SUM(COUNT(*)) OVER () adds up the per-status counts to get the overall total.
SELECT
    status,
    COUNT(*) AS cnt,
    ROUND(
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (),
        2
    ) AS pct
FROM orders
GROUP BY status;

-- Simple outlier detection: orders whose total_amount lies more than three standard
-- deviations below or above the mean of all orders.
SELECT *
FROM orders
WHERE total_amount < (
          SELECT AVG(total_amount) - 3 * STDDEV(total_amount)
          FROM orders
      )
   OR total_amount > (
          SELECT AVG(total_amount) + 3 * STDDEV(total_amount)
          FROM orders
      );

-- ============================================================================
-- 13. REPORTING QUERIES
-- ============================================================================

-- Daily shipment summary with a cumulative total and a trailing moving average.
-- The window functions run over the per-date groups, hence the nested SUM(SUM(...)).
-- seven_day_avg averages the current row and up to 6 preceding rows; that equals
-- 7 calendar days only when no report_date is missing from the table.
SELECT
    report_date,
    SUM(total_shipments) AS daily_shipments,
    SUM(SUM(total_shipments)) OVER (ORDER BY report_date) AS cumulative_shipments,
    ROUND(
        AVG(SUM(total_shipments)) OVER (
            ORDER BY report_date
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ),
        0
    ) AS seven_day_avg
FROM daily_delivery_metrics
GROUP BY report_date
ORDER BY report_date
LIMIT 30;

-- Year-over-year revenue comparison by calendar month (2023 vs 2024).
-- `by_month` pivots each year's revenue into its own column once, so the change
-- formula does not repeat the CASE expressions. The half-open date range replaces
-- YEAR(order_date) IN (2023, 2024) so an index on order_date can be used.
WITH by_month AS (
    SELECT
        MONTH(order_date) AS month,
        SUM(CASE WHEN YEAR(order_date) = 2023 THEN total_amount ELSE 0 END) AS revenue_2023,
        SUM(CASE WHEN YEAR(order_date) = 2024 THEN total_amount ELSE 0 END) AS revenue_2024
    FROM orders
    WHERE order_date >= '2023-01-01'
      AND order_date <  '2025-01-01'
    GROUP BY MONTH(order_date)
)
SELECT
    month,
    revenue_2023,
    revenue_2024,
    -- NULLIF yields NULL instead of dividing by zero when 2023 had no revenue that month.
    ROUND((revenue_2024 - revenue_2023) / NULLIF(revenue_2023, 0) * 100, 2) AS yoy_change
FROM by_month
ORDER BY month;

-- Cohort analysis skeleton: for each cohort (month of a customer's first order),
-- count the distinct customers who ordered in each calendar month.
WITH first_order AS (
    -- Formatting the earliest order_date gives the same 'YYYY-MM' value as taking the
    -- minimum of the formatted strings, since that format sorts chronologically.
    SELECT
        customer_id,
        DATE_FORMAT(MIN(order_date), '%Y-%m') AS cohort_month
    FROM orders
    GROUP BY customer_id
)
SELECT
    f.cohort_month,
    DATE_FORMAT(o.order_date, '%Y-%m') AS order_month,
    COUNT(DISTINCT o.customer_id)      AS customers
FROM first_order AS f
INNER JOIN orders AS o
    ON o.customer_id = f.customer_id
GROUP BY
    f.cohort_month,
    DATE_FORMAT(o.order_date, '%Y-%m')
ORDER BY
    f.cohort_month,
    order_month
LIMIT 50;

-- ============================================================================
-- Happy querying!
-- ============================================================================
