A programming language for LLM inference

Write the forward pass in source code. Compile through MLIR to CUDA, CPU-SIMD, MLX, or ROCm. The runtime is the program.

# Single-position attention read against an f16 KV cache.
# q packs all heads' queries (presumably D = H * Dh — TODO confirm);
# key_cache / value_cache hold every layer's cached keys/values.
# The `with` clause schedules the (head, timestep) grid: 8×64 tiles,
# parallel over h and t.
fn attention_f16(q: Tensor<f32, [D]>,
                 key_cache: Tensor<f16, [L, SeqLen, KvDim]>,
                 value_cache: Tensor<f16, [L, SeqLen, KvDim]>,
                 layer: i32, pos: i32, H: i32, scale: f32) -> Tensor<f32, [D]>
    with tile=[8, 64], parallel=[h, t] {

  var att: Tensor<f32, [H, SeqLen]> = zeros([H, SeqLen])

  # Compute attention scores
  # Causal mask: timesteps past `pos` score -inf so softmax zeroes them.
  # Keys are stored f16 and widened to f32 before the dot product.
  att[h, t] = if t > pos { -inf } else {
    sum(i) q[h * Dh + i] * (key_cache[layer, t, h * Dh + i] as f32) * scale
  }

  var weights: Tensor<f32, [H, SeqLen]> = softmax(att)
  # ... weighted sum over values
}
// MLIR after gpu-kernel-outlining (CUDA path)
module {
  // Outlined attention-score kernel (illustrative; operands elided with "...").
  func.func @attention_f16(%q: memref<?xf32>, %keys: memref<?x?x?xf16>, ...) {
    %c16 = arith.constant 16 : index
    %dim_H = memref.dim %att, %c0 : memref<?x?xf32>
    %dim_T = memref.dim %att, %c1 : memref<?x?xf32>

    gpu.launch blocks(%bh, %bt, %_) in (%grid_h, %grid_t, %c1)
               threads(%th, %tt, %_) in (%c16, %c16, %c1) {
      // Global indices: block offset scaled by the 16-wide block dim,
      // plus the in-block thread offset. (block_id + thread_id alone
      // would make neighbouring blocks alias the same elements.)
      %h0 = arith.muli %bh, %c16 : index
      %h = arith.addi %h0, %th : index
      %t0 = arith.muli %bt, %c16 : index
      %t = arith.addi %t0, %tt : index

      // Reduction loop for dot product; %acc carries the running sum.
      %score = scf.for %i = %c0 to %Dh iter_args(%acc = %cst) {
        %q_val = memref.load %q[%idx] : memref<?xf32>
        %k_val = memref.load %keys[%layer, %t, %idx] : memref<?x?x?xf16>
        %k_f32 = arith.extf %k_val : f16 to f32
        %prod = arith.mulf %q_val, %k_f32 : f32
        // Fold the product into the loop-carried accumulator before yielding;
        // the original yielded an undefined %sum and never accumulated.
        %sum = arith.addf %acc, %prod : f32
        scf.yield %sum : f32
      }
      memref.store %score, %att[%h, %t]
      gpu.terminator
    }
  }
}

The problem with current LLM runtimes

Most inference stacks are monolithic. The forward pass is buried in C++ or Python calling C++. You can't read it, you can't edit it, you can't retarget it. Want to change attention? Fork the repo. Want to run on different hardware? Wait for support. The model logic is locked inside the runtime.

How TensaLang works

TensaLang makes LLM logic source code. RMSNorm, RoPE, attention, MLP, sampling — all written in .tl files. The compiler uses MLIR as its core IR, then lowers to your target hardware. Write once, compile to any backend.

.tl source
Your model code
MLIR
linalg / scf / gpu
CUDA
CPU-SIMD
MLX / ROCm
planned

Scheduling in source

Tile sizes, parallel indices, memory placement — all expressed with `with` clauses. No compiler hacking required.

Builtins are overridable

Don't like the default softmax? Define your own. The compiler uses your implementation instead.

Real compiler pipeline

Not a DSL that emits strings. Full MLIR IR with optimization passes, then target-specific lowering.

Real code from the repo

These snippets are from working implementations of Llama2 and Qwen2.5-Coder running on CUDA and CPU.

# Matrix-vector multiplication with FP16 weights
# Matrix-vector multiplication with FP16 weights
# y[o] = Σ_i w[o,i]·x[i], with the f16 weight widened to f32 per element.
# Parallel over output rows `o`; the inner reduction over `i` stays serial
# per row (presumably vectorized by the backend — TODO confirm).
fn matmul_vec_f16(w: Tensor<f16, [O, I]>, x: Tensor<f32, [I]>) -> Tensor<f32, [O]>
    with parallel=[o] {
  var y: Tensor<f32, [O]>
  y[o] = sum(i) (w[o, i] as f32) * x[i]
  return y
}

# The compiler recognizes this pattern and can dispatch to cuBLAS GEMV
# when running on CUDA with compatible memory layouts.
# RMSNorm with FP16 weights, FP32 computation
# RMSNorm with FP16 weights, FP32 computation
# y[i] = weight[i] · x[i] / sqrt(mean(x²) + eps); eps guards against
# division by zero on an all-zero input.
fn rmsnorm(x: Tensor<f32, [D]>, weight: Tensor<f16, [D]>, eps: f32) -> Tensor<f32, [D]> {
  const n = dim(x, 0)
  # Sum of squares, reduced into a 1-element tensor.
  var ss: Tensor<f32, [1]>
  ss[0] = sum(i) x[i] * x[i]
  # Reciprocal root-mean-square, computed once and reused per element.
  const inv: f32 = 1.0 / sqrt(ss[0] / n + eps)

  var y: Tensor<f32, [D]>
  y[i] = (weight[i] as f32) * (x[i] * inv)
  return y
}
# Transformer layer from Llama2 implementation
# Transformer layer from Llama2 implementation
# (Excerpt: wv, w1/w2/w3, rms_att, xb_ffn etc. are loaded/computed in
# elided lines — this snippet is not self-contained.)
for l in 0..L {
  # Per-layer weights pulled from the safetensors map by key.
  const wq: Tensor<f16, [D, D]> = st[layer_key("layers.", l, ".wq")]
  const wk: Tensor<f16, [KvDim, D]> = st[layer_key("layers.", l, ".wk")]
  # ... load other weights

  # Attention block: pre-norm, then Q/K/V projections.
  var xb: Tensor<f32, [D]> = rmsnorm(x, rms_att, 1e-5)
  var q: Tensor<f32, [D]> = matmul_vec_f16(wq, xb)
  var k: Tensor<f32, [KvDim]> = matmul_vec_f16(wk, xb)
  var v: Tensor<f32, [KvDim]> = matmul_vec_f16(wv, xb)

  # Apply rotary embeddings and write K/V into the layer-l cache slot.
  rope_kv_f16(q, k, v, key_cache, value_cache, l, pos, H, KvH)
  var xb_att: Tensor<f32, [D]> = attention_f16_fused(q, key_cache, value_cache, ...)

  # MLP block with SiLU activation
  var hb: Tensor<f32, [Hidden]> = matmul_vec_f16(w1, xb_ffn)
  var hb2: Tensor<f32, [Hidden]> = matmul_vec_f16(w3, xb_ffn)
  var h: Tensor<f32, [Hidden]> = silu_mul(hb, hb2)
  # Residual connection: x += w2 · h
  add_inplace(x, matmul_vec_f16(w2, h))
}

One source, multiple targets

The same .tl code compiles to different backends. MLIR's dialect system handles the target-specific lowering.

attn_scores.tl
# Scaled attention scores: s[h, t] = scale · Σ_i q[h,i]·k[t,i].
# Keys are f16 in the cache and widened to f32 inside the reduction.
# Scheduled as 8×64 tiles, parallel over (h, t).
fn attn_scores(q: Tensor<f32, [H, Dh]>,
               k: Tensor<f16, [T, Dh]>,
               scale: f32)
    -> Tensor<f32, [H, T]>
    with tile=[8, 64], parallel=[h, t] {

  var s: Tensor<f32, [H, T]>
  s[h, t] = sum(i) q[h, i] *
            (k[t, i] as f32) * scale
  return s
}
CUDA kernel launch · CPU vectorized loop
// GPU dialect → NVVM → cubin
gpu.launch blocks(%bh, %bt, %_)
           in (%grid_h, %grid_t, %c1)
           threads(%th, %tt, %_)
           in (%c8, %c64, %c1) {
  // Global element indices: block offset × tile size (8 and 64, matching
  // the thread dims above) + in-block thread offset. Adding the raw block
  // id to the thread id would collapse all blocks onto the first tile.
  %h0 = arith.muli %bh, %c8
  %h = arith.addi %h0, %th
  %t0 = arith.muli %bt, %c64
  %t = arith.addi %t0, %tt
  // Reduction compiled to warp shuffle
  %score = scf.for ... iter_args(%acc)
  memref.store %score, %out[%h, %t]
  gpu.terminator
}
// scf/vector dialects — later lowered to LLVM IR with SIMD intrinsics
scf.parallel (%h, %t) = (%c0, %c0)
             to (%H, %T)
             step (%c1, %c1) {
  // Vectorized reduction (AVX2/AVX-512)
  %vec = vector.transfer_read
  %dot = vector.contract %q_vec, %k_vec
  %scaled = arith.mulf %dot, %scale
  memref.store %scaled, %out[%h, %t]
}

Quick start

Clone, build, run. Requires LLVM 18 and CUDA toolkit.

# Clone and build
git clone https://github.com/BenChaliah/Tensa-Lang.git
cd Tensa-Lang && ./build.sh

# Download example weights
git clone https://huggingface.co/DatarusAI/Tensa-Lang models

# Run Llama2 fp16 on CUDA
# (--cuda-arch must match your GPU's compute capability; sm_89 targets Ada-class cards)
./bin/tensalang-run examples/llama2_manual_tiling_fp16.tl \
  --model models/llama2_7b/llama2_7b_f16.safetensors \
  --tokenizer models/llama2_7b/tokenizer.json \
  --prompt "Once upon a time" \
  --target cuda --cuda-arch sm_89
# Build the Docker image
docker build -f docker/Dockerfile -t tensalang:local .

# Run with the helper script
# (model/tokenizer paths here are host paths mounted into the container)
./docker_command_exec examples/llama2_manual_tiling_fp16.tl \
  --model /path/to/llama2_7b_f16.safetensors \
  --tokenizer /path/to/tokenizer.json \
  --prompt "Once upon a time" \
  --target cuda --cuda-arch sm_89

I've written kernels and dug through inference runtimes. Every time I wanted to explore a new AI model architecture, it was the same story: weeks wiring ops into a massive codebase, working around abstractions that were not designed to be rewritten. And when throughput was off, the IR you could inspect was either too low-level or too tied to one backend's execution model.

I wanted a language where tensors are first-class and hardware targets are interchangeable. Where tiling lives in the source, and the IR preserves your algorithm's structure — tensor ops, loop nests, parallel dimensions — not thread layouts for a specific GPU.

18 months later, that's TensaLang.

Built by @BenChaliah

From the co-creator of Datarus-R1-14B.

Made in France 🇫🇷