// hakmem_ucb1.c - UCB1 Bandit Implementation
// Purpose: Automatic policy evolution via reinforcement learning
//
// License: MIT
// Date: 2025-10-21

#include "hakmem.h"
#include <math.h>
#include <string.h>
#include <time.h>
#include <stdio.h>

// ============================================================================
// Configuration
// ============================================================================

#define UCB1_EXPLORATION_FACTOR 2.0     // C in the exploration bonus √(C × ln(N) / n)
#define HYSTERESIS_IMPROVE_PCT 0.08     // Minimum relative improvement, as a fraction (0.08 = 8%)
#define HYSTERESIS_CONSECUTIVE 3        // Qualifying evaluations in a row required before adoption
#define COOLDOWN_SECS 180               // Seconds to wait after an adoption (3 minutes)

// ============================================================================
// Discrete Step Tables
// ============================================================================

// Candidate mmap thresholds in bytes, smallest first. The initializer lists
// six sizes, 64KB through 2MB, each twice the previous one.
static const size_t MMAP_THRESHOLD_STEPS[STEP_COUNT] = {
    (size_t)1 << 16,  // 64KB
    (size_t)1 << 17,  // 128KB
    (size_t)1 << 18,  // 256KB
    (size_t)1 << 19,  // 512KB
    (size_t)1 << 20,  // 1MB
    (size_t)1 << 21,  // 2MB
};

// Convert a step enum value to its mmap threshold in bytes.
//
// Any value outside [0, STEP_COUNT) maps to the last table entry. The range
// check is done on the unsigned value because the enum's underlying integer
// type is implementation-defined: where it is signed, a negative value such
// as the (MmapThresholdStep)-1 "invalid" marker used in this file would pass
// a plain `>=` test and index before the start of the table.
static inline size_t step_to_bytes(MmapThresholdStep step) {
    if ((size_t)step >= (size_t)STEP_COUNT) {
        return MMAP_THRESHOLD_STEPS[STEP_COUNT - 1];
    }
    return MMAP_THRESHOLD_STEPS[step];
}

// ============================================================================
// UCB1 State (per call-site)
// ============================================================================

// Bandit bookkeeping for one call-site: per-step reward statistics plus the
// hysteresis and cooldown state that gate changes of the active step.
typedef struct {
    // Per-step statistics
    double avg_reward[STEP_COUNT];       // Running mean of the rewards recorded for each step
    uint64_t step_trials[STEP_COUNT];    // Number of rewards recorded for each step

    // Current state
    MmapThresholdStep current_step;      // Currently active step
    uint64_t total_trials;               // Total trials across all steps (sum of step_trials)

    // Hysteresis (safety mechanism)
    MmapThresholdStep candidate_step;    // Step being considered for adoption; (MmapThresholdStep)-1 when none
    uint32_t consecutive_count;          // Qualifying evaluations in a row for candidate_step

    // Cooldown (stability mechanism)
    uint64_t last_adoption_time_ms;      // get_time_ms() reading at the last adoption; 0 = none recorded
} UCB1State;

// ============================================================================
// Global UCB1 State
// ============================================================================

// State table with 256 slots; the code visible in this file only uses slot 0.
static UCB1State g_ucb1_states[256];  // Per-site UCB1 state (simplified)
// Written by hak_enable_evolution(); while 0, hak_trigger_evolution() returns
// immediately and hak_ucb1_get_step() reports the baseline step.
static int g_evolution_enabled = 0;   // 0 = baseline, 1 = evolving

// ============================================================================
// KPI Measurement
// ============================================================================

// Get the current CLOCK_MONOTONIC reading in whole milliseconds.
//
// Returns 0 if the clock cannot be read, instead of computing a value from an
// uninitialized timespec. Note that 0 is also the value cooldown_can_adjust()
// interprets as "no adoption recorded", so a failed read never blocks
// evolution.
static uint64_t get_time_ms(void) {
    struct timespec ts = {0};
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0;
    }
    return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

// Linearly rescale `value` so that `min` maps to 0 and `max` maps to 1.
// The result is not clamped: inputs outside [min, max] produce results
// outside [0, 1]. When max - min is below 0.001 (including max < min) the
// midpoint 0.5 is returned instead of dividing by a tiny or negative range.
static double normalize(double value, double min, double max) {
    const double span = max - min;
    return (span < 0.001) ? 0.5 : (value - min) / span;
}

// Turn a KPI snapshot into a scalar reward; a larger reward is better.
//
// The reward is the negated weighted cost
//     cost = p99 + 0.5 * page_faults + 0.2 * rss
// where each term is first rescaled by normalize() against a fixed range:
// 0..1000 ns for P99 latency, 0..100 for hard page faults and -10..+10 MB
// for the RSS delta.
static double calculate_reward(const hak_kpi_t* kpi) {
    const double p99_cost = normalize((double)kpi->p99_alloc_ns, 0.0, 1000.0);
    const double fault_cost = normalize((double)kpi->hard_page_faults, 0.0, 100.0);
    const double rss_cost = normalize((double)kpi->rss_delta_mb, -10.0, 10.0);

    // P99 carries full weight, page faults half, RSS one fifth.
    const double weighted_cost = p99_cost + 0.5 * fault_cost + 0.2 * rss_cost;
    return -weighted_cost;
}

// ============================================================================
// UCB1 Algorithm
// ============================================================================

// UCB1 score of `step`: its mean reward plus the exploration bonus
//     sqrt(UCB1_EXPLORATION_FACTOR * ln(N) / n)
// with N = state->total_trials and n = state->step_trials[step].
// A step with no trials scores INFINITY so that it outranks every finite
// score and gets explored first.
static double ucb1_score(
    const UCB1State* state,
    MmapThresholdStep step
) {
    const uint64_t pulls = state->step_trials[step];
    if (pulls == 0) {
        return INFINITY;
    }

    const double log_total = log((double)state->total_trials);
    const double bonus = sqrt(UCB1_EXPLORATION_FACTOR * log_total / (double)pulls);
    return state->avg_reward[step] + bonus;
}

// Select best step using UCB1 (only ±1 neighbors)
static MmapThresholdStep ucb1_select(const UCB1State* state) {
    MmapThresholdStep current = state->current_step;
    MmapThresholdStep best_step = current;
    double best_score = ucb1_score(state, current);

    // Try previous step (if exists)
    if (current > 0) {
        MmapThresholdStep prev = (MmapThresholdStep)(current - 1);
        double score = ucb1_score(state, prev);
        if (score > best_score) {
            best_score = score;
            best_step = prev;
        }
    }

    // Try next step (if exists)
    if (current < STEP_COUNT - 1) {
        MmapThresholdStep next = (MmapThresholdStep)(current + 1);
        double score = ucb1_score(state, next);
        if (score > best_score) {
            best_score = score;
            best_step = next;
        }
    }

    return best_step;
}

// Record one observed `reward` for `step`.
// Folds the reward into that step's running mean, then increments both the
// step's own trial count and the overall trial count.
static void ucb1_update(UCB1State* state, MmapThresholdStep step, double reward) {
    const uint64_t seen = state->step_trials[step];
    double* mean = &state->avg_reward[step];

    // First sample becomes the mean; afterwards
    // mean_new = (mean_old × seen + reward) / (seen + 1).
    *mean = (seen == 0) ? reward : (*mean * seen + reward) / (seen + 1);

    state->step_trials[step] = seen + 1;
    state->total_trials += 1;
}

// ============================================================================
// Hysteresis (Safety Mechanism)
// ============================================================================

// Decide whether `new_step` has earned adoption.
//
// An evaluation qualifies when new_reward beats current_reward by at least
// HYSTERESIS_IMPROVE_PCT relative to |current_reward|. Qualifying evaluations
// for the same candidate are counted in state->consecutive_count; a different
// candidate restarts the count at 1 and a non-qualifying evaluation clears it.
//
// Returns 1 once the count reaches HYSTERESIS_CONSECUTIVE (the hysteresis
// state is cleared at that point), otherwise 0.
static int hysteresis_should_adopt(
    UCB1State* state,
    MmapThresholdStep new_step,
    double current_reward,
    double new_reward
) {
    const double gain = new_reward - current_reward;
    const double baseline = fabs(current_reward);

    // A relative improvement is undefined for a zero baseline. Instead of
    // dividing by zero and relying on inf/NaN comparisons, treat any strictly
    // positive gain as sufficient in that case (the same decision the
    // division would have produced under IEEE 754).
    const int sufficient = (baseline > 0.0)
        ? (gain / baseline >= HYSTERESIS_IMPROVE_PCT)
        : (gain > 0.0);

    if (!sufficient) {
        // Improvement insufficient: forget the candidate and its streak.
        state->consecutive_count = 0;
        state->candidate_step = (MmapThresholdStep)-1;
        return 0;
    }

    if (state->candidate_step == new_step) {
        state->consecutive_count++;
    } else {
        // New candidate, start its streak.
        state->candidate_step = new_step;
        state->consecutive_count = 1;
    }

    if (state->consecutive_count < HYSTERESIS_CONSECUTIVE) {
        return 0;
    }

    // Threshold reached: clear the hysteresis state and adopt.
    state->consecutive_count = 0;
    state->candidate_step = (MmapThresholdStep)-1;  // Invalid
    return 1;
}

// ============================================================================
// Cooldown (Stability Mechanism)
// ============================================================================

// Report whether a step change is allowed right now.
// Returns 1 when no adoption has been recorded (timestamp 0) or when at
// least COOLDOWN_SECS whole seconds have passed since the recorded adoption;
// returns 0 otherwise.
static int cooldown_can_adjust(const UCB1State* state) {
    const uint64_t adopted_at_ms = state->last_adoption_time_ms;
    if (adopted_at_ms == 0) {
        return 1;
    }

    const uint64_t waited_secs = (get_time_ms() - adopted_at_ms) / 1000;
    return waited_secs >= COOLDOWN_SECS;
}

// Stamp `state` with the current get_time_ms() reading as its adoption time,
// which is the reference point cooldown_can_adjust() measures from.
static void cooldown_record_adoption(UCB1State* state) {
    const uint64_t now_ms = get_time_ms();
    state->last_adoption_time_ms = now_ms;
}

// ============================================================================
// Evolution Cycle (Main Logic)
// ============================================================================

void hak_trigger_evolution(void) {
    if (!g_evolution_enabled) {
        return;  // Evolution disabled
    }

    printf("\n[UCB1] Evolution cycle triggered\n");

    // For PoC, we only evolve the first active site
    // Real implementation would iterate all sites
    UCB1State* state = &g_ucb1_states[0];

    // 1. Check cooldown
    if (!cooldown_can_adjust(state)) {
        printf("[UCB1] Cooldown active, skipping evolution\n");
        return;
    }

    // 2. Measure current KPI
    hak_kpi_t kpi;
    hak_get_kpi(&kpi);

    // 3. Calculate reward
    double reward = calculate_reward(&kpi);
    printf("[UCB1] Current reward: %.3f (P99=%lu ns, PF=%lu)\n",
           reward,
           (unsigned long)kpi.p99_alloc_ns,
           (unsigned long)kpi.hard_page_faults);

    // 4. Update statistics for current step
    ucb1_update(state, state->current_step, reward);

    // 5. Select best step using UCB1
    MmapThresholdStep best_step = ucb1_select(state);

    printf("[UCB1] UCB1 selected step: %d (current: %d)\n",
           best_step, state->current_step);

    // 6. Check if different from current
    if (best_step != state->current_step) {
        // Get current reward (average of current step)
        double current_reward = state->avg_reward[state->current_step];
        double new_reward = state->avg_reward[best_step];

        // 7. Hysteresis check
        if (hysteresis_should_adopt(state, best_step, current_reward, new_reward)) {
            printf("[UCB1] ✅ ADOPTING new step %d → %d (improvement: %.1f%%)\n",
                   state->current_step, best_step,
                   (new_reward - current_reward) / fabs(current_reward) * 100.0);

            state->current_step = best_step;
            cooldown_record_adoption(state);
        } else {
            printf("[UCB1] Hysteresis: need %d more consecutive improvements\n",
                   HYSTERESIS_CONSECUTIVE - state->consecutive_count);
        }
    }

    printf("[UCB1] Current step: %d (%zu bytes)\n",
           state->current_step, step_to_bytes(state->current_step));
}

// ============================================================================
// Public API
// ============================================================================

// Switch evolution on or off and log the new mode.
//
// Enabling always resets slot 0 of the state table: all statistics are
// zeroed, the active step becomes STEP_256KB and no candidate is pending.
// Disabling only updates the flag; the stored state is left untouched.
void hak_enable_evolution(int enable) {
    g_evolution_enabled = enable;

    if (!enable) {
        printf("[UCB1] Evolution DISABLED (baseline mode)\n");
        return;
    }

    printf("[UCB1] Evolution ENABLED\n");

    // Simplified: only the first site is initialized.
    UCB1State* site = &g_ucb1_states[0];
    memset(site, 0, sizeof *site);
    site->current_step = STEP_256KB;               // Reasonable default start
    site->candidate_step = (MmapThresholdStep)-1;  // No candidate yet
}

// Step currently in effect (simplified: always site 0).
// While evolution is disabled this is the fixed baseline STEP_256KB.
MmapThresholdStep hak_ucb1_get_step(void) {
    return g_evolution_enabled ? g_ucb1_states[0].current_step : STEP_256KB;
}

// mmap threshold in bytes for the step reported by hak_ucb1_get_step().
size_t hak_ucb1_get_threshold(void) {
    return step_to_bytes(hak_ucb1_get_step());
}