diff --git a/common/arg.cpp b/common/arg.cpp
index 0fc94e553..267259dc2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -10,6 +10,8 @@
 #include "speculative.h"
 #include "preset.h"
 
+#include <cstdlib> // setenv for the sparse-attn POC flags -- NOTE(review): setenv is POSIX-only, Windows needs _putenv_s
+
 // fix problem with std::min and std::max
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -1435,6 +1437,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--sparse-attn-sink"}, "N",
+        "sparse-attn prefill POC: keep first N KV positions (attention sink); needs --sparse-attn-window (default: 0)",
+        [](common_params & /*params*/, int n_sink) {
+            setenv("LLAMA_SPARSE_SINK", std::to_string(n_sink).c_str(), /*overwrite=*/1); // exported via env only
+        }
+    ));
+    add_opt(common_arg(
+        {"--sparse-attn-window"}, "N",
+        "sparse-attn prefill POC: keep last N KV positions per query (local window); >=0 enables, drops the middle (fast but lossy at depth; full-attn layers only)",
+        [](common_params & /*params*/, int n_window) {
+            setenv("LLAMA_SPARSE_WINDOW", std::to_string(n_window).c_str(), /*overwrite=*/1); // exported via env only
+        }
+    ));
+    add_opt(common_arg(
+        {"--sparse-attn-stride"}, "N",
+        "sparse-attn prefill POC: also keep every Nth 256-token KV block globally (dilated long-range coverage; 0=off)",
+        [](common_params & /*params*/, int n_stride) {
+            setenv("LLAMA_SPARSE_STRIDE", std::to_string(n_stride).c_str(), /*overwrite=*/1); // exported via env only
+        }
+    ));
     add_opt(common_arg(
         {"--swa-full"},
         string_format("use full-size SWA cache (default: %s)\n"
@@ -2494,6 +2517,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"--dynsparse-swa"}, "N",
+        "hybrid sliding-window attention: window the attention layers of an attention+SSM hybrid (Qwen3.6, "
+        "Qwen3-Next, Granite-H) to W=N tokens (0=off). Bounds attention KV + flattens prefill at long context. "
+        "Pair with --dynsparse-swa-keepfull / --dynsparse-swa-full to backstop far recall.",
+        [](common_params & params, int value) { // int handler, matching the other numeric options (e.g. --sparse-attn-sink)
+            params.dynsparse_swa = value;
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA"));
+    add_opt(common_arg(
+        {"--dynsparse-swa-keepfull"}, "K",
+        "with --dynsparse-swa: keep the last K attention layers FULL (un-windowed) as a far-recall backstop (default: 0)",
+        [](common_params & params, int value) { // int handler, matching the other numeric options (e.g. --sparse-attn-sink)
+            params.dynsparse_swa_keepfull = value;
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA_KEEPFULL"));
+    add_opt(common_arg(
+        {"--dynsparse-swa-full"}, "i,j,k",
+        "with --dynsparse-swa: keep an explicit comma-separated set of attention-layer indices FULL (e.g. 27,31,35,39)",
+        [](common_params & params, const std::string & layer_list) {
+            params.dynsparse_swa_full = layer_list; // stored verbatim, not parsed here
+        }
+    ).set_env("LLAMA_DYNSPARSE_SWA_FULL"));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/common/common.cpp b/common/common.cpp
index 0dd9ede5e..93cc3c628 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1544,6 +1544,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.dynsparse_swa          = params.dynsparse_swa;
+    mparams.dynsparse_swa_keepfull = params.dynsparse_swa_keepfull;
+    mparams.dynsparse_swa_full     = params.dynsparse_swa_full.empty() ? nullptr : params.dynsparse_swa_full.c_str();
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 2adb310b8..3d14fc00c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -476,6 +476,10 @@ struct common_params {
     bool    fit_params_print   = false; // print the estimated required memory to run the model
     int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
 
+    int32_t     dynsparse_swa          = 0;  // hybrid attention-layer sliding-window size W (0 = off)
+    int32_t     dynsparse_swa_keepfull = 0;  // keep last-K attention layers full (far-recall backstop)
+    std::string dynsparse_swa_full;          // explicit comma-separated attn-layer indices to keep full
+
     // margin per device in bytes for fitting parameters to free memory:
     std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
diff --git a/include/llama.h b/include/llama.h
index f723c9f60..b80a6da72 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -315,6 +315,11 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        // dynsparse-SWA (hybrid attention-layer windowing); 0/NULL = disabled (falls back to LLAMA_DYNSPARSE_SWA* env)
+        int32_t      dynsparse_swa;          // sliding-window size W for the hybrid's attention layers (0 = off)
+        int32_t      dynsparse_swa_keepfull; // keep the last-K attention layers full (far-recall backstop)
+        const char * dynsparse_swa_full;     // explicit comma-separated attn-layer indices to keep full (NULL = none)
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
@@ -772,6 +777,16 @@ extern "C" {
               llama_seq_id seq_id);
 
     // Check if the memory supports shifting
+    // 395aimax: evict from the BASE (non-SWA / full-attn) KV cache of a hybrid-iSWA memory.
+    // Removes every base position for `seq_id` NOT listed in keep[0..n_keep); leaves the SWA
+    // window and the recurrent (DeltaNet) state untouched. Returns false if `mem` is not
+    // hybrid-iSWA. Follow with a state save/clear/restore to physically compact (shrink n_kv).
+    LLAMA_API bool llama_memory_evict_base(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+         const llama_pos * keep,
+                   int32_t n_keep);
+
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
     //
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 8be5f28f3..69d0cf18b 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <cstdlib>
+
 #include "llama.h"
 
 #include <array>
@@ -370,6 +372,9 @@ struct llama_hparams {
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
         assert(p0 >= 0 && p1 >= 0);
+        // OURS: env-gated attention SINK (StreamingLLM) - always keep the first K key positions attendable.
+        { static const int swa_sink = []{ const char * e = getenv("LLAMA_SWA_SINK"); return e ? atoi(e) : 0; }();
+          if (swa_sink > 0 && p0 < (llama_pos) swa_sink) { return false; } }
 
         switch (swa_type) {
             case LLAMA_SWA_TYPE_NONE:
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d58ebac28..2dec13d94 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2291,6 +2291,9 @@ llama_model_params llama_model_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.dynsparse_swa               =*/ 0,
+        /*.dynsparse_swa_keepfull      =*/ 0,
+        /*.dynsparse_swa_full          =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_direct_io               =*/ false,
diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp
index eb23095ae..fcef8a558 100644
--- a/src/models/granite-hybrid.cpp
+++ b/src/models/granite-hybrid.cpp
@@ -23,6 +23,9 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
         hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
     }
 
+    // Optional attention-layer windowing (--dynsparse-swa / LLAMA_DYNSPARSE_SWA), applied by the shared helper.
+    apply_dynsparse_swa(hparams, this->params);
+
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
     switch (hparams.n_embd) {
@@ -141,8 +144,6 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
 
     inpL = build_inp_embd(model.tok_embd);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // Positional embeddings populated if rope enabled
@@ -151,6 +152,8 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
         inp_pos = build_inp_pos();
     }
 
+    // Generic lambda so the loop runs with either the plain hybrid input or the hybrid-iswa input (SWA).
+    auto run_layers = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
 
@@ -177,6 +180,9 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
         // input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { run_layers(build_inp_mem_hybrid_iswa()); }
+    else { run_layers(build_inp_mem_hybrid()); }
 
     cur = inpL;
 
@@ -198,9 +204,10 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
+template <typename TAttn>
 ggml_tensor * llama_model_granite_hybrid::graph::build_attention_layer(ggml_tensor *             cur,
                                                               ggml_tensor *             inp_pos,
-                                                              llm_graph_input_attn_kv * inp_attn,
+                                                              TAttn *                   inp_attn,
                                                               const llama_model &       model,
                                                               const int64_t             n_embd_head,
                                                               const int                 il) {
diff --git a/src/models/models.h b/src/models/models.h
index 7a52e7bc1..f2fad5d3a 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -7,6 +7,52 @@
 // note: almost all graphs require at least sqrtf, so include cmath globally
 #include <cmath>
 
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+
+// Shared dynamic-sparse SWA marking for the hybrid arches (qwen35moe, qwen3next, granite-hybrid, ...).
+// The window W comes from llama_model_params::dynsparse_swa, else from LLAMA_DYNSPARSE_SWA; no-op when W <= 0.
+// Every non-recurrent (attention) layer is marked sliding-window(W); the last `keepfull` of them and any layer
+// named in the comma-separated full list are then kept FULL. Arch-agnostic: uses only n_layer() and is_recr().
+static inline void apply_dynsparse_swa(llama_hparams & hparams, const llama_model_params & mp) {
+    // Resolve from CLI params (precedence) or LLAMA_DYNSPARSE_SWA* env (fallback / back-compat).
+    int32_t swa_w = mp.dynsparse_swa;
+    if (swa_w <= 0) { const char * e = getenv("LLAMA_DYNSPARSE_SWA"); swa_w = e ? atoi(e) : 0; }
+    if (swa_w <= 0) { return; }
+    int32_t keepfull = mp.dynsparse_swa_keepfull;
+    if (keepfull <= 0) { const char * e = getenv("LLAMA_DYNSPARSE_SWA_KEEPFULL"); keepfull = e ? atoi(e) : 0; }
+    const char * full_list = mp.dynsparse_swa_full;
+    if (!full_list || !*full_list) { full_list = getenv("LLAMA_DYNSPARSE_SWA_FULL"); }
+
+    const uint32_t n_main = hparams.n_layer(); // transformer layers (excludes nextn)
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    hparams.n_swa    = (uint32_t) swa_w;
+    for (uint32_t i = 0; i < n_main; ++i) {
+        hparams.is_swa_impl[i] = hparams.is_recr(i) ? 0u : 1u; // window the attn (non-recurrent) layers
+    }
+    // keepfull: walk back from the last layer and un-window up to `keepfull` attention layers
+    for (int i = (int) n_main - 1; i >= 0 && keepfull > 0; --i) {
+        if (hparams.is_swa_impl[i]) { hparams.is_swa_impl[i] = 0u; --keepfull; }
+    }
+    // explicit list: strtol + end-pointer check so a non-numeric token is skipped; atoi() turned it into 0,
+    // silently keeping layer 0 full. Out-of-range indices are ignored as before.
+    for (const char * p = full_list; p && *p; ) {
+        char * end = nullptr;
+        const long idx = strtol(p, &end, 10);
+        if (end != p && idx >= 0 && idx < (long) n_main) { hparams.is_swa_impl[idx] = 0u; }
+        while (*end != '\0' && *end != ',') { ++end; } // skip whatever is left of this token
+        p = (*end == ',') ? end + 1 : end;
+    }
+    // NOTE(review): swa_type stays STANDARD even if no layer remains windowed after the two steps above -- confirm
+    // the hybrid-iswa path copes with an empty windowed set.
+    fprintf(stderr, "[dynsparse-swa] W=%u n_main=%u windowed_attn=[", (unsigned) swa_w, n_main);
+    for (uint32_t i = 0; i < n_main; ++i) { if (hparams.is_swa_impl[i]) { fprintf(stderr, "%u,", i); } }
+    fprintf(stderr, "] kept_full_attn=[");
+    for (uint32_t i = 0; i < n_main; ++i) { if (!hparams.is_recr(i) && !hparams.is_swa_impl[i]) { fprintf(stderr, "%u,", i); } }
+    fprintf(stderr, "]\n");
+}
+
 //
 // base classes
 //
@@ -1555,7 +1601,7 @@ struct llama_model_granite_hybrid : public llama_model_base {
     struct graph : public llm_build_mamba_base {
         graph(const llama_model & model, const llm_graph_params & params);
         ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
-        ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
+        template <typename TAttn> ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, TAttn * inp_attn,
             const llama_model & model,const int64_t n_embd_head, const int il);
     };
 
@@ -1905,8 +1951,9 @@ struct llama_model_qwen3next : public llama_model_base {
     struct graph : public llm_build_delta_net_base {
         graph(const llama_model & model, const llm_graph_params & params);
     private:
+        template <typename TAttn>
         ggml_tensor * build_layer_attn(
-        llm_graph_input_attn_kv * inp_attn,
+        TAttn * inp_attn,
                     ggml_tensor * cur,
                     ggml_tensor * inp_pos,
                             int   il);
@@ -1992,8 +2039,9 @@ struct llama_model_qwen35moe : public llama_model_base {
     struct graph : public llm_build_delta_net_base {
         graph(const llama_model & model, const llm_graph_params & params);
     private:
+        template <class InpAttn>
         ggml_tensor * build_layer_attn(
-        llm_graph_input_attn_kv * inp_attn,
+        InpAttn * inp_attn,
                     ggml_tensor * cur,
                     ggml_tensor * inp_pos,
                             int * sections,
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 7b0876cbb..068d233b6 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -1,4 +1,6 @@
 #include "models.h"
+
+#include <string>
 #include "llama-memory-recurrent.h"
 
 void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
@@ -29,6 +31,8 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
         }
     }
 
+    apply_dynsparse_swa(hparams, this->params);
+
     switch (hparams.n_layer()) {
         case 40: type = LLM_TYPE_35B_A3B; break;
         case 48: type = LLM_TYPE_122B_A10B; break;
@@ -172,12 +176,13 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
 
     cb(inpL, "model.input_embed", -1);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
+    // SWA recipe (apply_dynsparse_swa): when swa_type!=NONE the model is hybrid-iSWA; build the iswa hybrid
+    // input so the windowed attn layers attend a rolling KV. Generic lambda handles both input types.
+    auto run_transformer = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         res->t_layer_inp[il] = inpL;
 
@@ -227,6 +232,12 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
         // Input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        run_transformer(build_inp_mem_hybrid_iswa());
+    } else {
+        run_transformer(build_inp_mem_hybrid());
+    }
     cur = inpL;
 
     // post-norm hidden state feeds both the LM head and the MTP seed below
@@ -278,8 +289,9 @@ ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
+template <class InpAttn>
 ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn(
-        llm_graph_input_attn_kv * inp,
+        InpAttn * inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
         int *                     sections,
@@ -346,6 +358,22 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn(
                 nullptr, nullptr, nullptr,
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
     cb(cur, "attn_pregate", il);
+    // OURS: env-gated HEAD ABLATION. LLAMA_ABLATE_HEAD_RANGE="a,b" zeros attn-output heads [a,b) (all attn layers).
+    { static const char * abl_env = getenv("LLAMA_ABLATE_HEAD_RANGE");
+      if (abl_env) {
+        const std::string as(abl_env); const size_t cpos = as.find(',');
+        const int a = atoi(as.substr(0, cpos).c_str());
+        const int b = (cpos == std::string::npos) ? a : atoi(as.substr(cpos + 1).c_str());
+        if (b > a) { // skip empty/reversed ranges: for b < a the mask would be -1 on [b,a) and double those heads
+          const int64_t N = (int64_t) n_embd_head * n_head;
+          const float fa = (float) ((int64_t) a * n_embd_head);
+          const float fb = (float) ((int64_t) b * n_embd_head);
+          // ascending aranges only; +0.5 dodges step(0). ge_a[i]=1 if i>=a*ehd, ge_b[i]=1 if i>=b*ehd.
+          ggml_tensor * ge_a = ggml_step(ctx0, ggml_arange(ctx0, 0.5f - fa, (float) N + 0.5f - fa, 1.0f));
+          ggml_tensor * ge_b = ggml_step(ctx0, ggml_arange(ctx0, 0.5f - fb, (float) N + 0.5f - fb, 1.0f));
+          cur = ggml_sub(ctx0, cur, ggml_mul(ctx0, cur, ggml_sub(ctx0, ge_a, ge_b))); // mask = ge_a - ge_b: 1 inside [a,b), 0 outside
+          cb(cur, "attn_head_ablated", il);
+      } } }
 
     ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
     cb(gate_sigmoid, "gate_sigmoid", il);
@@ -601,7 +629,12 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    auto * inp_attn = build_attn_inp_kv();
+    // MTP+SWA: when the SWA recipe sets swa_type, the model runs on the iSWA hybrid cache, so the MTP
+    // attention input must also be iSWA (the non-iswa build_attn_inp_kv asserts swa_type==NONE). The MTP
+    // layer index (n_layer()) is not marked SWA, so it routes to the full sub-cache (attends full KV).
+    const bool mtp_use_iswa = hparams.swa_type != LLAMA_SWA_TYPE_NONE;
+    llm_graph_input_attn_kv      * inp_attn      = mtp_use_iswa ? nullptr : build_attn_inp_kv();
+    llm_graph_input_attn_kv_iswa * inp_attn_iswa = mtp_use_iswa ? build_attn_inp_kv_iswa() : nullptr;
 
     ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
@@ -658,9 +691,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     const float kq_scale = hparams.f_attention_scale == 0.0f
             ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
-    cur = build_attn(inp_attn,
-            nullptr, nullptr, nullptr,
-            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cur = mtp_use_iswa
+        ? build_attn(inp_attn_iswa, nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il)
+        : build_attn(inp_attn,      nullptr, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
     cb(cur, "mtp_attn_pregate", il);
 
     cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index 97200a440..ad5b4df12 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -22,6 +22,8 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) {
         }
     }
 
+    apply_dynsparse_swa(hparams, this->params);
+
     switch (hparams.n_layer()) {
         case 48: type = LLM_TYPE_80B_A3B; break;
         default: type = LLM_TYPE_UNKNOWN;
@@ -115,11 +117,11 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
     inpL = build_inp_embd(model.tok_embd);
     cb(inpL, "model.embed_tokens", -1);
 
-    auto * inp = build_inp_mem_hybrid();
-
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
+    // Generic lambda so the same loop runs with either the plain hybrid input or the hybrid-iswa input (SWA).
+    auto run_layers = [&](auto * inp) {
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -167,6 +169,9 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
         // Input for next layer
         inpL = cur;
     }
+    };
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { run_layers(build_inp_mem_hybrid_iswa()); }
+    else { run_layers(build_inp_mem_hybrid()); }
     cur = inpL;
 
     // Final norm
@@ -203,8 +208,9 @@ ggml_tensor * llama_model_qwen3next::graph::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
+template <typename TAttn>
 ggml_tensor * llama_model_qwen3next::graph::build_layer_attn(
-        llm_graph_input_attn_kv * inp,
+        TAttn *                   inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
         int                       il) {