# BinStatistics.jl

Highly flexible and efficient computation of n-dimensional binned statistic(s) for n-variable(s)

BinStatistics provides the binstats function that is built on top of DataFrames.jl and CategoricalArrays.jl

## binstats function

"""
binstats(df, axis_col, axis_edges, bin_col;
grp_function = [nrow], col_function = [mean], missing_bins = false)

Returns a DataFrame containing function values for binned variables of df.

# Arguments
- axis_col: binning axes column(s)
- axis_edges: bin edges for axes column(s) axis_col
- bin_col: column(s) to be binned
- grp_function = [nrow]: column independent function(s) to be applied at group level
- col_function = [mean]: column dependent function(s) to be applied to bin_col at group level
- missing_bins = false: include missing bins
"""


## Examples

using BinStatistics
using DataFrames
using CairoMakie
using Statistics


### make synthetic data

begin
    # number of synthetic samples
    n = 1_000_000

    # x and y are independent uniform draws on [0, 20)
    df = DataFrame(; x = 20 .* rand(n), y = 20 .* rand(n))

    # v1 varies with x only, v2 with both coordinates; each carries Gaussian noise
    # with a standard deviation of 3
    df.v1 = cos.(df.x) .+ 3 .* randn(n)
    df.v2 = cos.(df.x .- df.y) .+ sin.(df.x .+ df.y) .+ 3 .* randn(n)

    # v3 is simply the sum of the two noisy signals
    df.v3 = df.v1 .+ df.v2
end


### Example 1: calculate count/nrow and mean of v1 binned according to x

# Bin :v1 along :x using the edges 0:0.1:20; with the default grp_function and
# col_function this produces the nrow and v1_mean columns shown in the output below.
df1 = binstats(df, :x, 0:0.1:20, :v1)

200×3 DataFrame
Row │ x             nrow   v1_mean
│ String        Int64  Float64
─────┼───────────────────────────────
1 │ [0.0, 0.1)     5081  0.973442
2 │ [0.1, 0.2)     5079  1.03621
⋮  │      ⋮          ⋮       ⋮
199 │ [19.8, 19.9)   5201  0.521451
200 │ [19.9, 20.0)   5050  0.490683


### Example 2: calculate count/nrow and mean of v1 and v2 binned according to x

# Passing a vector of column names bins several variables in one call; each one gets
# its own <name>_mean column in the result.
df2 = binstats(df, :x, 0:0.1:20, ["v1", "v2"])

200×4 DataFrame
Row │ x             nrow   v1_mean   v2_mean
│ String        Int64  Float64   Float64
─────┼──────────────────────────────────────────
1 │ [0.0, 0.1)     5081  0.973442  0.0567808
2 │ [0.1, 0.2)     5079  1.03621   0.0653569
⋮  │      ⋮          ⋮       ⋮          ⋮
199 │ [19.8, 19.9)   5201  0.521451  0.0459481
200 │ [19.9, 20.0)   5050  0.490683  0.0915996


### Example 3: calculate count/nrow, mean, median and std of v1 binned according to x

# col_function lists the statistics evaluated on v1 within each bin; every entry adds
# a v1_<function> column (v1_mean, v1_median, v1_std) next to the nrow count.
df3 = binstats(df, :x, 0:0.1:20, :v1; col_function = [mean, median, std])

200×5 DataFrame
Row │ x             nrow   v1_mean   v1_median  v1_std
│ String        Int64  Float64   Float64    Float64
─────┼───────────────────────────────────────────────────
1 │ [0.0, 0.1)     5081  0.973442   0.973191  2.97307
2 │ [0.1, 0.2)     5079  1.03621    1.02465   2.99917
⋮  │      ⋮          ⋮       ⋮          ⋮         ⋮
199 │ [19.8, 19.9)   5201  0.521451   0.436727  3.01971
200 │ [19.9, 20.0)   5050  0.490683   0.482965  3.00587


### Example 4: calculate count/nrow and mean of v2 binned according to y and x

# Two binning axes (:y, then :x), each with its own 0.2-wide edge range.
df4 = binstats(
    df, [:y, :x], [0:0.2:20, 0:0.2:20], [:v2];
    missing_bins = true,
)

10000×4 DataFrame
Row │ y             x             nrow   v2_mean
│ String        String        Int64  Float64
───────┼────────────────────────────────────────────
1 │ [0.0, 0.2)    [0.0, 0.2)      104  1.11192
2 │ [0.0, 0.2)    [0.2, 0.4)       87  1.40544
⋮   │      ⋮             ⋮          ⋮       ⋮
9999 │ [19.8, 20.0)  [19.6, 19.8)     87  1.8668
10000 │ [19.8, 20.0)  [19.8, 20.0)    106  2.04332


### Example 5: calculate median of v2 binned according to y and x using non-uniform axis_edges

# Squaring an evenly spaced range yields non-uniform edges (0, 0.25, 1, ..., 20.25).
# An empty grp_function drops the nrow column, leaving only the per-bin median.
df5 = binstats(
    df,
    [:y, :x],
    [(0:0.5:4.5) .^ 2, (0:0.5:4.5) .^ 2],
    [:v2];
    grp_function = [],
    col_function = [median],
    missing_bins = true,
)

81×3 DataFrame
Row │ y              x              v2_median
│ String         String         Float64
─────┼───────────────────────────────────────────
1 │ [0.0, 0.25)    [0.0, 0.25)     0.940375
2 │ [0.0, 0.25)    [0.25, 1.0)     1.76134
⋮  │       ⋮              ⋮             ⋮
80 │ [16.0, 20.25)  [12.25, 16.0)  -0.0137548
81 │ [16.0, 20.25)  [16.0, 20.25)  -0.00810516


## Plotting script

# Example 1: raw samples of v1 (left) next to the per-bin mean of v1 (right)
fig = let canvas = Figure()
    raw_ax = Axis(canvas[1, 1]; title = "raw data")
    scatter!(raw_ax, df.x, df.v1)

    # the first column of df1 holds the x-bin labels; bincenter turns each into a number
    binned_ax = Axis(canvas[1, 2]; title = "binned data")
    scatter!(binned_ax, bincenter.(df1[:, 1]), df1.v1_mean)

    canvas
end

# Example 2: raw samples of v1 (left) next to the per-bin means of v1 and v2 (right)
fig = let canvas = Figure()
    raw_ax = Axis(canvas[1, 1]; title = "raw data")
    scatter!(raw_ax, df.x, df.v1)

    binned_ax = Axis(canvas[1, 2]; title = "binned data")
    # the first column of df2 holds the x-bin labels shared by both series
    scatter!(binned_ax, bincenter.(df2[:, 1]), df2.v1_mean; label = "v1")
    scatter!(binned_ax, bincenter.(df2[:, 1]), df2.v2_mean; label = "v2")

    # called without an axis argument, so the legend goes to Makie's current axis
    axislegend()

    canvas
end

# Example 3: raw samples of v1 (left) next to its per-bin mean, median and std (right)
fig = let canvas = Figure()
    raw_ax = Axis(canvas[1, 1]; title = "raw data")
    scatter!(raw_ax, df.x, df.v1)

    binned_ax = Axis(canvas[1, 2]; title = "binned data")
    # one series per statistic column of df3, all plotted against the x-bin centers
    scatter!(binned_ax, bincenter.(df3[:, 1]), df3.v1_mean; label = "mean")
    scatter!(binned_ax, bincenter.(df3[:, 1]), df3.v1_median; label = "median")
    scatter!(binned_ax, bincenter.(df3[:, 1]), df3.v1_std; label = "std")

    # called without an axis argument, so the legend goes to Makie's current axis
    axislegend()

    canvas
end

# Example 4: raw v2 samples coloured by value (left) next to the 2-D binned mean (right).
# Both panels put y on the horizontal axis and x on the vertical axis.
fig = let canvas = Figure()
    raw_ax = Axis(canvas[1, 1]; title = "raw data")
    scatter!(raw_ax, df.y, df.x; color = df.v2, colormap = :thermal, markersize = 1)
    xlims!(raw_ax, 0, 20)
    ylims!(raw_ax, 0, 20)

    binned_ax = Axis(canvas[1, 2]; title = "binned data")

    # numeric bin centers for the two binning axes (column 1 is y, column 2 is x)
    ycenters = unique(bincenter.(df4[:, 1]))
    xcenters = unique(bincenter.(df4[:, 2]))

    # df4 lists its rows with x varying fastest, so reshape produces an (n_x, n_y)
    # grid. heatmap! expects size (length(first coords), length(second coords)),
    # i.e. (n_y, n_x) here, hence the permutedims; without it the panel is transposed.
    grid = permutedims(reshape(df4.v2_mean, length(xcenters), length(ycenters)))
    heatmap!(binned_ax, ycenters, xcenters, grid; colormap = :thermal)

    canvas
end

# Example 5: raw v2 samples coloured by value (left) next to the 2-D binned median on
# non-uniform bins (right). Both panels put y on the horizontal axis and x on the
# vertical axis.
fig = let canvas = Figure()
    raw_ax = Axis(canvas[1, 1]; title = "raw data")
    scatter!(raw_ax, df.y, df.x; color = df.v2, colormap = :thermal, markersize = 1)
    xlims!(raw_ax, 0, 20)
    ylims!(raw_ax, 0, 20)

    binned_ax = Axis(canvas[1, 2]; title = "binned data")

    # numeric bin centers for the two binning axes (column 1 is y, column 2 is x)
    ycenters = unique(bincenter.(df5[:, 1]))
    xcenters = unique(bincenter.(df5[:, 2]))

    # df5 was built with col_function = [median], so its value column is v2_median
    # (there is no v2_mean column). Rows are listed with x varying fastest, so reshape
    # produces an (n_x, n_y) grid; heatmap! needs (n_y, n_x) for these coordinate
    # vectors, hence the permutedims.
    grid = permutedims(reshape(df5.v2_median, length(xcenters), length(ycenters)))
    heatmap!(binned_ax, ycenters, xcenters, grid; colormap = :thermal)

    canvas
end