Custom Histogram Bins Using the `breaks` Parameter

In [1]:
%useLatestDescriptors
%use dataframe
%use lets-plot(output="js, svg")
In [2]:
// Report the Lets-Plot Kotlin API / JS versions and the active output formats.
LetsPlot.getInfo()
Out[2]:
Lets-Plot Kotlin API v.4.12.0. Frontend: Notebook with dynamically loaded JS. Lets-Plot JS v.4.8.1.
Outputs: Web (HTML+JS), Static SVG (hidden)
In [3]:
// Load the diamonds dataset from the lets-plot-docs repository.
val url = "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/diamonds.csv"
val df = DataFrame.readCSV(url)
// Column-name -> values map; this is the form passed to letsPlot(...) in the cells below.
val data = df.toMap()
// Print the dataset dimensions as "<rows> x <columns>".
println("${df.rowsCount()} x ${df.columnsCount()}")
// Last expression of the cell: preview of the first rows.
df.head()
53940 x 10
Out[3]:

DataFrame: rowsCount = 5, columnsCount = 10

carat    | cut     | color | clarity | depth     | table     | price | x        | y        | z
0,230000 | Ideal   | E     | SI2     | 61,500000 | 55,000000 | 326   | 3,950000 | 3,980000 | 2,430000
0,210000 | Premium | E     | SI1     | 59,800000 | 61,000000 | 326   | 3,890000 | 3,840000 | 2,310000
0,230000 | Good    | E     | VS1     | 56,900000 | 65,000000 | 327   | 4,050000 | 4,070000 | 2,310000
0,290000 | Premium | I     | VS2     | 62,400000 | 58,000000 | 334   | 4,200000 | 4,230000 | 2,630000
0,310000 | Good    | J     | SI2     | 63,300000 | 58,000000 | 335   | 4,340000 | 4,350000 | 2,750000

Default Bins

In [4]:
// Histogram of "price" with the default binning (no `breaks` supplied).
letsPlot(data) { x = "price" } +
    geomHistogram(color = "black", fill = "gray80")
Out[4]:
0 5,000 10,000 15,000 20,000 0 2,000 4,000 6,000 8,000 10,000 12,000 14,000 count price

Equi-probable Bins

In [5]:
/**
 * Computes break points that split [sortedValues] into [binCount] bins, each holding
 * approximately the same number of observations.
 *
 * The breaks are the 0/binCount, 1/binCount, ..., binCount/binCount quantiles, obtained by
 * linear interpolation between the two neighbouring order statistics.
 *
 * @param sortedValues sample values in ascending order; must not be empty
 * @param binCount number of equi-probable bins; must be positive
 * @return ascending break points; repeated values (caused by ties in the data) are dropped
 *         so that no zero-width bin is produced
 * @throws IllegalArgumentException if [sortedValues] is empty or [binCount] is not positive
 */
fun quantileBreaks(sortedValues: List<Double>, binCount: Int = 10): List<Double> {
    require(sortedValues.isNotEmpty()) { "Cannot compute quantile breaks of an empty sample" }
    require(binCount > 0) { "binCount must be positive, but was $binCount" }

    val lastIndex = sortedValues.lastIndex
    return (0..binCount).map { i ->
        // Fractional position of the i-th quantile; done in Double to avoid Int overflow on large samples.
        val pos = i.toDouble() * lastIndex / binCount
        val lower = sortedValues[floor(pos).toInt()]
        val upper = sortedValues[ceil(pos).toInt()]
        lower + (upper - lower) * (pos - floor(pos))
    }.distinct()
}

// All numeric values of the "price" column in ascending order (empty if the column is absent).
val prices = data["price"]
    ?.filterIsInstance<Number>()
    ?.map(Number::toDouble)
    ?.sorted()
    ?: emptyList()

val n = prices.size
// Deciles of the price distribution: 11 break points delimiting 10 equi-probable bins.
val priceBins = quantileBreaks(prices, binCount = 10)

// Histogram of "price" whose bin edges are the break points in priceBins
// instead of the default binning.
letsPlot(data) { x = "price" } +
    geomHistogram(
        breaks = priceBins,
        color = "black",
        fill = "gray80"
    ) +
    themeClassic()
Out[5]:
0 2,000 4,000 6,000 8,000 10,000 12,000 14,000 16,000 18,000 0 1,000 2,000 3,000 4,000 5,000 count price

Identity Stat

In [6]:
// Bin edges at successive powers of two: 2^0, 2^1, ..., 2^15.
val breaks = List(16) { exponent -> 2.0.pow(exponent) }
// Midpoint of every pair of neighbouring edges, i.e. one center per bin.
val centers = breaks.zipWithNext { left, right -> (left + right) / 2.0 }

/**
 * Returns the center of the bin that contains [v].
 *
 * Bins are the half-open intervals `[a, b)` formed by each pair of consecutive values in
 * [breaks]. Consequently a value equal to the last break belongs to no bin.
 *
 * @param v the value to locate; may be null
 * @param breaks bin edges in ascending order; fewer than two edges define no bins
 * @return the midpoint `(a + b) / 2` of the matching bin, or null when [v] is null or
 *         lies outside every bin
 */
fun binCenter(v: Double?, breaks: List<Double>): Double? {
    if (v == null) return null
    // No separate range check is needed: a value outside [first, last) matches no interval,
    // and an empty or single-element `breaks` simply produces no intervals.
    return breaks.zipWithNext()
        .firstOrNull { (lo, hi) -> v >= lo && v < hi }
        ?.let { (lo, hi) -> (lo + hi) / 2.0 }
}

// Convert "price" to Double? and attach to every row the center of the bin its price falls into.
// binCenter returns null for prices outside the breaks, so such rows get a null "bin_x".
val dfWithBin = df
    .convert { "price"<Number?>() }.to<Double?>()
    .add("bin_x") { binCenter(this["price"] as Double?, breaks) }   // bin center, corresponding to current price

// Count the rows per bin center. The displayed result is not sorted by "bin_x"
// (presumably groups appear in order of first occurrence - TODO confirm).
val aggDf = dfWithBin
    .groupBy("bin_x")
    .aggregate { count() into "count" }                             // aggregated dataframe: bin center -> size of bin

// Last expression of the cell: display the aggregated table.
aggDf
Out[6]:

DataFrame: rowsCount = 7, columnsCount = 2

bin_x        | count
384,000000   | 1997
768,000000   | 13008
3072,000000  | 10482
6144,000000  | 11608
12288,000000 | 6369
24576,000000 | 979
1536,000000  | 9497
In [7]:
// Plot the pre-aggregated table: x is the bin center and y the precomputed count.
// Stat.identity is passed so the counts in "count" are used as given; the same `breaks`
// used for the aggregation are supplied (presumably so bars span the bin edges - TODO confirm).
letsPlot(aggDf.toMap()) {
    x = "bin_x"
    y = "count"
} + geomHistogram(
    stat = Stat.identity,
    breaks = breaks,
    color = "black",
    fill = "gray80"
) + xlab("price")
Out[7]:
0 5,000 10,000 15,000 20,000 25,000 30,000 0 2,000 4,000 6,000 8,000 10,000 12,000 count price