Cookbook
End-to-end recipes for common tasks. Every snippet compiles and
assumes import "github.com/Gaurav-Gosain/golars" plus whatever
sub-package a particular line needs.
Typed columns for compile-time literal checks
The expr package ships a typed facade (expr.C[T], expr.Int,
expr.Float, expr.Str, expr.Bool) that lets Go infer literal
types from method arguments, eliminating the expr.Lit(int64(...))
boilerplate:
import "github.com/Gaurav-Gosain/golars/expr"
qty := expr.Int("qty")
price := expr.Float("price")
out, _ := lazy.FromDataFrame(df).
Filter(expr.All(qty.Gt(2), price.Lt(50))).
WithColumns(
price.MulCol(qty.CastFloat64()).As("total").Expr,
qty.Between(2, 5).Alias("in_range"),
).
Collect(ctx)

The runtime plan is identical to the untyped expr.Col("qty").GtLit(int64(2)) form. Passing a string literal to an int-typed
column fails at build time rather than panicking at evaluation.
See examples/*/generic/ in the repository for side-by-side
comparisons.
List and struct namespaces
Expression-level helpers mirror polars' .list.* and .struct.*:
import "github.com/Gaurav-Gosain/golars/expr"
lazy.FromDataFrame(df).Select(
expr.Col("tags").List().Len().Alias("tag_count"),
expr.Col("payload").Struct().Field("x").Alias("x"),
expr.Col("csv").Str().SplitExact(",").List().Get(0).Alias("first"),
).Collect(ctx)

Supported list reducers: Len, Sum, Mean, Min, Max, First,
Last, Get(idx), Contains(needle), Join(sep) (string lists).
Supported struct ops: Field(name).
Unnest / explode / upsample
Unnest a struct column:
out, _ := df.Unnest(ctx, "payload")
// struct {x:i64, y:str} becomes two top-level cols `x` and `y`.

Explode a list column (null and empty lists become a single null row):
out, _ := df.Explode(ctx, "tags")
// [[a, b, c], [], NULL, [d]] produces 3 + 1 + 1 + 1 = 6 rows.

Upsample a sorted timestamp column to a dense grid:
out, _ := df.Upsample(ctx, "ts", "1d")
// rows spaced > 1d apart get filled with null-valued neighbours.

Accepted intervals: ns, us, ms, s, m, h, d, w.
Calendar units (mo, y) are rejected.
Pretty-print a logical plan
fmt.Println(lazy.ExplainTree(plan.Plan()))
// SORT [total desc]
// └── AGG keys=[dept] aggs=[...]
// └── FILTER (col("salary") > 75)
// └── SCAN df

lazy.ExplainTreeASCII swaps the box-drawing glyphs for ASCII
fallbacks. lf.ExplainTree() is the full three-section report
(logical, optimiser, optimised) rendered as a tree.
Read a CSV, filter, write Parquet
df, _ := golars.ReadCSV("trades.csv")
defer df.Release()
out, _ := golars.Lazy(df).
Filter(golars.Col("volume").GtLit(int64(100))).
Collect(ctx)
defer out.Release()
golars.WriteParquet(out, "heavy_trades.parquet")

Group + aggregate multiple columns in one pass
agg, _ := golars.Lazy(df).
GroupBy("symbol").
Agg(
golars.Sum("qty"),
golars.Mean("price").Alias("avg_price"),
golars.Max("price").Alias("hi"),
).
Sort("qty_sum", true).
Collect(ctx)

golars detects that every agg runs over the same group-by buckets
and fuses them through groupby_fused.go, so all three aggregations
cost one scan.
Join a CSV against a Parquet lookup table
trades, _ := golars.ReadCSV("trades.csv")
defer trades.Release()
lookup, _ := golars.ReadParquet("symbols.parquet")
defer lookup.Release()
out, _ := golars.Lazy(trades).
Join(golars.Lazy(lookup), []string{"symbol"}, golars.InnerJoin).
Collect(ctx)

Scan (lazy I/O) plus predicate pushdown
import iocsv "github.com/Gaurav-Gosain/golars/io/csv"
// iocsv.Scan returns a LazyFrame that opens the file only when
// Collect runs. Combined with Filter + Select, the optimiser pushes
// the projection down through the scan.
lf := iocsv.Scan("/tmp/huge.csv").
Filter(golars.Col("region").EqLit("us")).
Select(golars.Col("symbol"), golars.Col("price"))
for batch, err := range lf.IterBatches(ctx) {
if err != nil { log.Fatal(err) }
defer batch.Release()
// stream-process each batch here
}

Null handling: drop, fill, or flag
clean, _ := golars.Lazy(df).DropNulls("price", "qty").Collect(ctx)
filled, _ := golars.Lazy(df).FillNull(int64(0)).Collect(ctx)
mask, _ := df.AnyNullMask(ctx)
defer mask.Release()
// `mask` is a boolean Series you can plug back into Filter to flag
// bad rows without dropping them.

Select by dtype or name predicate
import "github.com/Gaurav-Gosain/golars/selector"
numericOnly, _ := df.SelectBy(selector.Numeric())
noTimes := df.DropBy(selector.EndsWith("_ts"))
// Combinators: intersect, union, minus.
usdCols, _ := df.SelectBy(selector.Intersect(
selector.Float(),
selector.StartsWith("price_usd"),
))

Cross-language with Arrow
rec := df.ToArrow() // arrow.RecordBatch
tbl := df.ToArrowTable() // arrow.Table (multi-chunk)
roundtrip, _ := dataframe.FromArrowTable(tbl)

Both sides are Arrow IPC format-compatible. Write with io/ipc.Write
and read in PyArrow, pola.rs, DuckDB, or any other Arrow-aware tool
without format conversion.
String munging
out, _ := df.Apply(func(s *series.Series) (*series.Series, error) {
if s.Name() != "email" { return s.Clone(), nil }
return s.Str().Before("@")
})

.Str().Before / .After / .SplitNth cover the common parsing
cases; .SplitWide returns multiple Series so you can stitch them
into a DataFrame with extra columns.
Cache an intermediate pipeline
base := golars.Lazy(df).
Filter(golars.Col("active").EqLit(true)).
Cache()
// Two downstream pipelines share the same filtered base.
top, _ := base.Sort("score", true).Head(10).Collect(ctx)
flag, _ := base.Filter(golars.Col("score").LtLit(0.5)).Collect(ctx)

Cache memoises the first Collect result; subsequent collects reuse it. The cached frame is released automatically when the cache's LazyFrame handle is garbage-collected.
When / then / otherwise
out, _ := golars.Lazy(df).
Select(golars.When(golars.Col("age").Gt(golars.Lit(18))).
Then(golars.Lit("adult")).
Otherwise(golars.Lit("minor")).
Alias("category")).
Collect(ctx)

Mixed numeric dtypes are promoted (int then + float otherwise -> float64 out). Null cond values are treated as false (polars semantics).
Rolling operations
// Rolling sum/mean/min/max/std/var with a fixed window.
out, _ := golars.Lazy(df).
Select(
golars.Col("price").RollingMean(30, 1).Alias("ma30"),
golars.Col("price").RollingStd(30, 5).Alias("vol30"),
).Collect(ctx)

Second argument is min_periods (0 = require full window). Int64 inputs
with no nulls take a SIMD-friendly O(n) slide (two-phase warmup +
4-way unrolled step).
Regex on strings
// Boolean mask for regex hits.
mask, _ := series.FromString("s", []string{"a1", "xx", "b22"}, nil).
Str().ContainsRegex(`\d+`)
// Extract first capture group.
ids, _ := emails.Str().Extract(`@([a-z.]+)$`, 1)
// Count matches per row.
counts, _ := tokens.Str().CountMatchesRegex(`\w+`)

Pivot (long -> wide)
// Mirror of polars' df.pivot(index="id", on="cat", values="v").
wide, _ := df.Pivot(ctx, []string{"id"}, "cat", "v", dataframe.PivotSum)

Aggregators: PivotFirst, PivotSum, PivotMean, PivotMin, PivotMax, PivotCount.
Window functions with .Over(...)
// Per-group total broadcast back to every row.
out, _ := golars.Lazy(df).
Select(golars.Col("revenue").Sum().Over("region").Alias("region_total")).
Collect(ctx)
// Per-group rank.
ranked, _ := golars.Lazy(df).
Select(golars.Col("score").Rank("dense").Over("cohort").Alias("rank_in_cohort")).
Collect(ctx)

Forward / backward fill + NaN
// Replace every NaN with 0 in float columns (integer cols pass through).
filled, _ := golars.Lazy(df).FillNan(0).Collect(ctx)
// Carry the last non-null value forward through consecutive nulls.
// Pass limit=3 to stop after three consecutive fills; limit=0 means unlimited.
ff, _ := golars.Lazy(df).ForwardFill(0).Collect(ctx)
bf, _ := golars.Lazy(df).BackwardFill(0).Collect(ctx)
// Per-column variant via Expr:
out, _ := golars.Lazy(df).
WithColumns(golars.Col("price").ForwardFill(0).Alias("price")).
Collect(ctx)

Reshape: transpose, unpivot, partition
// Transpose: each input column becomes a row; output value type is
// the promoted float64 (polars' object-dtype fallback is not yet
// supported, so non-numeric inputs error).
wide, _ := df.Transpose(ctx, "column", "row")
// Unpivot (melt): turn value columns into two long-form columns:
// variable (the original column name) + value.
long, _ := df.Unpivot(ctx, []string{"id"}, nil /* default: all non-id */)
// Partition: one DataFrame per distinct key tuple; ordered by first
// appearance. Caller releases each partition.
parts, _ := df.PartitionBy(ctx, "region", "symbol")
for _, p := range parts {
defer p.Release()
}

Top-K / Bottom-K / Pipe
topSellers, _ := df.TopK(ctx, 10, "revenue")
worstLatency, _ := df.BottomK(ctx, 5, "p99_ms")
// Pipe keeps chained code flat:
out, _ := df.Pipe(func(d *DataFrame) (*DataFrame, error) {
return d.Filter(ctx, mask)
})

Stats: skew, kurtosis, corr, cov, approx_n_unique
sk, _ := col.Skew() // polars default (biased)
sku, _ := col.SkewUnbiased() // scipy bias=False
kk, _ := col.Kurtosis() // excess kurtosis
c, _ := a.PearsonCorr(b) // Pearson r
cov, _ := a.Covariance(b, 1) // ddof=1
corrMat, _ := df.Corr(ctx) // k-by-k frame
covMat, _ := df.Cov(ctx, 1)
approx, _ := col.ApproxNUnique() // HLL estimate

Extra math helpers
// Trig + hyperbolic family: atan2, cbrt, sinh/cosh/tanh, log1p, expm1,
// radians/degrees, arccos/arcsin/arctan, cot, arcsinh/arccosh/arctanh.
radians, _ := col.Radians()
y, _ := colY.Arctan2(colX)

Coalesce, concat_str, ones/zeros/int_range
picked, _ := golars.Lazy(df).
Select(golars.Coalesce(golars.Col("primary"), golars.Col("fallback"))).
Collect(ctx)
joined, _ := golars.Lazy(df).
Select(golars.ConcatStr("-", golars.Col("sym"), golars.Col("year"))).
Collect(ctx)
// Build Series out of thin air:
ids, _ := golars.Lazy(df).
WithColumns(golars.IntRange(0, 100, 1).Alias("idx")).
Collect(ctx)

Arrow IPC streaming (cross-language)
sw, _ := golars.NewIPCStreamWriter(conn, firstBatch)
for batch := range batches {
sw.Write(ctx, batch)
}
sw.Close()
// Consumer side (also polars/pyarrow/DuckDB-compatible):
sr, _ := golars.NewIPCStreamReader(conn)
defer sr.Close()
for batch, err := range sr.Iter(ctx) {
if err != nil { log.Fatal(err) }
process(batch)
batch.Release()
}

Row-wise (horizontal) reductions
// Append a column that sums three others on a per-row basis.
withTotal, _ := golars.Lazy(df).
SumHorizontal("total", "q1", "q2", "q3").
Collect(ctx)
defer withTotal.Release()
// Or compute the reduction directly as a standalone Series.
total, _ := golars.SumHorizontal(ctx, df, "q1", "q2", "q3")
defer total.Release()

Variants: SumHorizontal, MeanHorizontal, MinHorizontal,
MaxHorizontal, AllHorizontal, AnyHorizontal. Omit the column
list to span every numeric (or boolean) column. Null handling
defaults to IgnoreNulls; pass dataframe.PropagateNulls on the
frame-level method for polars' strict semantics.
One-row frame-level aggregates
sums, _ := df.SumAll(ctx) // one row, one column per numeric input
means, _ := df.MeanAll(ctx)
counts, _ := df.CountAll(ctx) // counts non-nulls for every column
nulls, _ := df.NullCountAll(ctx)

These mirror polars' df.sum(), df.mean(), df.count(). They are
convenient for dashboards, describe-style summaries, and streamed
ETL checkpoints.
Lazy scans (pushdown-friendly I/O)
lf := golars.ScanCSV("huge.csv").
Filter(golars.Col("region").EqLit("us")).
Select(golars.Col("symbol"), golars.Col("price"))
out, _ := lf.Collect(ctx)

Every format has a scan entry point: ScanCSV, ScanParquet,
ScanIPC, ScanJSON, ScanNDJSON. Compared to Read*, a scan
defers opening the file until Collect so the optimiser can
push projections + filters into the reader.
Rank and percent-change
r, _ := golars.Lazy(df).
Select(
golars.Col("score").Rank("dense").Alias("rank"),
golars.Col("score").PctChange(1).Alias("delta"),
).
Collect(ctx)

Apply a custom Go function
out, _ := df.Apply(func(s *series.Series) (*series.Series, error) {
switch s.DType().String() {
case "i64":
return s.ApplyInt64(func(v int64) int64 { return v * 2 })
case "str":
return s.Str().Upper()
}
return s.Clone(), nil
})

REPL / scripting quickies
Inside golars or in a .glr file:
load data/trades.csv
filter volume > 100
with_row_index row
cast price f64
fill_null 0
rename volume as qty
sum qty
write out.parquet

Scalar-only prints: .sum COL, .mean COL, .min COL, etc. write
one-line results instead of a table, convenient for quick spot
checks.
Per-language equivalents
| polars (Python) | golars (Go) |
|---|---|
| pl.read_csv(p) | golars.ReadCSV(p) |
| df.filter(pl.col("x") > 5) | df.Filter(ctx, mask) or lazy with golars.Col("x").GtLit(5) |
| df.group_by("k").agg(pl.sum("v")) | df.GroupBy("k").Agg(ctx, []expr.Expr{expr.Col("v").Sum()}) |
| df.unique() | df.Unique(ctx) |
| df.sample(n=10) | df.Sample(ctx, 10, false, seed) |
| df.with_row_index() | df.WithRowIndex("index", 0) |
| pl.col("s").str.to_uppercase() | s.Str().Upper() |
| df.select(cs.numeric()) | df.SelectBy(selector.Numeric()) |
| pl.sum_horizontal("a", "b") | golars.SumHorizontal(ctx, df, "a", "b") |
| df.fill_nan(0) | lf.FillNan(0) |
| df.fill_null(strategy="forward") | lf.ForwardFill(0) |
| df.fill_null(strategy="backward") | lf.BackwardFill(0) |
| df.top_k(10, by="x") | df.TopK(ctx, 10, "x") |
| df.transpose() | df.Transpose(ctx, "column", "row") |
| df.unpivot(index=["id"]) | df.Unpivot(ctx, []string{"id"}, nil) |
| df.partition_by("k") | df.PartitionBy(ctx, "k") |
| df.corr() | df.Corr(ctx) |
| pl.coalesce(...) | golars.Coalesce(...) |
| pl.concat_str(..., sep) | golars.ConcatStr(sep, ...) |
| pl.int_range(0, n) | golars.IntRange(0, int64(n), 1) |
| pl.when(p).then(a).otherwise(b) | golars.When(p).Then(a).Otherwise(b) |
| pl.col("x").rolling_sum(w) | golars.Col("x").RollingSum(w, 0) |
| pl.col("x").sum().over("k") | golars.Col("x").Sum().Over("k") |
| s.str.extract(pattern, i) | s.Str().Extract(pattern, i) |
| s.str.contains_regex(p) | s.Str().ContainsRegex(p) |
| df.pivot(index=, on=, values=) | df.Pivot(ctx, index, on, values, agg) |
| df.sum() | df.SumAll(ctx) |
| pl.scan_csv(p) | golars.ScanCSV(p) |
| pl.scan_parquet(p) | golars.ScanParquet(p) |
See docs/api-surface.md for the full cross-reference.