Back to Article
Scotch Whiskey Data Exploration Notebook
Download Notebook

Scotch Whiskey Data Exploration Notebook

Setup polars

Just increasing the size of the output to be able to read it easier.

In [161]:
import polars as pl

pl.Config.set_fmt_str_lengths(900)
pl.Config.set_tbl_width_chars(900)
polars.config.Config

Extract data from placemat

I manually input the data from the placemat into the list of dictionaries below. Where I made edits I also commented why the edits were made.

In [162]:
placement_data = [
    # 1998
    {"tasting_year": 1998, "rank": 1, "name": "Macallan (12)"},
    {"tasting_year": 1998, "rank": 2, "name": "Glenrothes (15)"},
    {"tasting_year": 1998, "rank": 3, "name": "Bowmore (15)"},
    {"tasting_year": 1998, "rank": 4, "name": "Glenkinchie (10)"},
    {"tasting_year": 1998, "rank": 5, "name": "Talisker (10)"},
    # 1999
    {"tasting_year": 1999, "rank": 1, "name": "Glenmorangie (18)"},
    {"tasting_year": 1999, "rank": 2, "name": "Macallan (18)"},
    {"tasting_year": 1999, "rank": 3, "name": "Tobermory (Ledaig 1974)"},
    {"tasting_year": 1999, "rank": 4, "name": "Scapa (12)"},
    {"tasting_year": 1999, "rank": 5, "name": "Bunnahabhain (12)"},
    # 2000
    {"tasting_year": 2000, "rank": 1, "name": "Glenmorangie (1971)"},
    {"tasting_year": 2000, "rank": 2, "name": "Highland Park (12)"},
    {"tasting_year": 2000, "rank": 3, "name": "Balvenie (15)"},
    {"tasting_year": 2000, "rank": 4, "name": "Dalwhinnie (15)"},
    # Assume Bowmaore was a typo and should be Bowmore.
    {"tasting_year": 2000, "rank": 5, "name": "Bowmore (12)"},
    # 2001
    {"tasting_year": 2001, "rank": 1, "name": "Macallan (25)"},
    {"tasting_year": 2001, "rank": 2, "name": "Glengoyne (17)"},
    {"tasting_year": 2001, "rank": 3, "name": "Knockando (12)"},
    {"tasting_year": 2001, "rank": 4, "name": "Oban (14)"},
    {"tasting_year": 2001, "rank": 5, "name": "Talisker (10)"},
    # 2002
    {"tasting_year": 2002, "rank": 1, "name": "Highland Park (1977)"},
    {"tasting_year": 2002, "rank": 2, "name": "Coleburn (21)"},
    {"tasting_year": 2002, "rank": 3, "name": "Glenglassaugh (1974)"},
    {"tasting_year": 2002, "rank": 4, "name": "Lagavulin (16)"},
    {"tasting_year": 2002, "rank": 5, "name": "Glenkinchie (Distiller's Edition)"},
    # 2003
    {"tasting_year": 2003, "rank": 1, "name": "Glenfiddich (21)"},
    {"tasting_year": 2003, "rank": 2, "name": "Teaninich (21)"},
    {"tasting_year": 2003, "rank": 3, "name": "St. Magdalene (25)"},
    {"tasting_year": 2003, "rank": 4, "name": "Ardbeg (17)"},
    {"tasting_year": 2003, "rank": 5, "name": "Linkwood (25)"},
    # 2004
    {"tasting_year": 2004, "rank": 1, "name": "Balvenie (25)"},
    {"tasting_year": 2004, "rank": 2, "name": "Macallan (1841 Replica)"},
    {"tasting_year": 2004, "rank": 3, "name": "Provenance (10)"},
    {"tasting_year": 2004, "rank": 4, "name": "Bowmore (10)"},
    {"tasting_year": 2004, "rank": 5, "name": "Old Pulteney (12)"},
    # 2005
    {"tasting_year": 2005, "rank": 1, "name": "Highland Park (25)"},
    {"tasting_year": 2005, "rank": 2, "name": "Glen Ord (12)"},
    {"tasting_year": 2005, "rank": 3, "name": "Inverleven (1986)"},
    {"tasting_year": 2005, "rank": 4, "name": "Tobermory (Mull Village 10)"},
    {"tasting_year": 2005, "rank": 5, "name": "Caol Ila (Cask Strength)"},
    # 2006
    {"tasting_year": 2006, "rank": 1, "name": "Poit Dhubh (12)"},
    {"tasting_year": 2006, "rank": 2, "name": "Balblair (16)"},
    {"tasting_year": 2006, "rank": 3, "name": "Glenburgie (Glencraig 29)"},
    {"tasting_year": 2006, "rank": 4, "name": "Auchentoshan (21)"},
    {"tasting_year": 2006, "rank": 5, "name": "Fettercairn (13)"},
    # 2007
    {"tasting_year": 2007, "rank": 1, "name": "Macallan (35)"},
    {"tasting_year": 2007, "rank": 2, "name": "Jura (Superstition ?)"},
    {"tasting_year": 2007, "rank": 3, "name": "Highland Park (30)"},
    {"tasting_year": 2007, "rank": 4, "name": "Glen Ord (25)"},
    {"tasting_year": 2007, "rank": 5, "name": "St. Magdalene (1975)"},
    # 2008
    {"tasting_year": 2008, "rank": 1, "name": "Glenfarclas (30)"},
    {"tasting_year": 2008, "rank": 2, "name": "Scapa (26)"},
    {"tasting_year": 2008, "rank": 3, "name": "Pittyvaich (1976)"},
    {"tasting_year": 2008, "rank": 4, "name": "Auchentoshan (18)"},
    {"tasting_year": 2008, "rank": 5, "name": "Arran (?)"},
    # 2009
    {"tasting_year": 2009, "rank": 1, "name": "Glenmorangie (25)"},
    # TODO: Is Glenrothes and Glenrothes the same?
    {"tasting_year": 2009, "rank": 2, "name": "Glenrothes (18)"},
    {"tasting_year": 2009, "rank": 3, "name": "Brora (24)"},
    {"tasting_year": 2009, "rank": 4, "name": "Glenfarclas (Blairfindy 31)"},
    {"tasting_year": 2009, "rank": 5, "name": "Arran (10 Burns' 250th Anniversary)"},
    # 2010
    {"tasting_year": 2010, "rank": 1, "name": "Glenlivet (XXV)"},
    {"tasting_year": 2010, "rank": 2, "name": "Tamdhu (18)"},
    {"tasting_year": 2010, "rank": 3, "name": "Rosebank (18)"},
    {"tasting_year": 2010, "rank": 4, "name": "Port Ellen (1982)"},
    {"tasting_year": 2010, "rank": 5, "name": "Old Pulteney (17)"},
    # 2011
    {"tasting_year": 2011, "rank": 1, "name": "Bruichladdich (16)"},
    {"tasting_year": 2011, "rank": 2, "name": "Glen Ord (Singleton 12)"},
    {"tasting_year": 2011, "rank": 3, "name": "Glenfarclas (40)"},
    {"tasting_year": 2011, "rank": 4, "name": "Scapa (16)"},
    {"tasting_year": 2011, "rank": 5, "name": "Bladnoch (1992)"},
    # 2012
    {"tasting_year": 2012, "rank": 1, "name": "Macallan (1969)"},
    {"tasting_year": 2012, "rank": 2, "name": "Balblair (1989)"},
    {"tasting_year": 2012, "rank": 3, "name": "Balvenie (21)"},
    {"tasting_year": 2012, "rank": 4, "name": "Highland Park (12)"},
    {"tasting_year": 2012, "rank": 5, "name": "Tobermory (15)"},
    # 2013
    {"tasting_year": 2013, "rank": 1, "name": "Glengoyne (21)"},
    {"tasting_year": 2013, "rank": 2, "name": "Glen Keith (1968)"},
    {"tasting_year": 2013, "rank": 3, "name": "Glen Grant (1974)"},
    {"tasting_year": 2013, "rank": 4, "name": "Jura (Prophency ?)"},
    {"tasting_year": 2013, "rank": 5, "name": "Linkwood (12)"},
    # 2014
    {"tasting_year": 2014, "rank": 1, "name": "Glenfiddich (30)"},
    {"tasting_year": 2014, "rank": 2, "name": "Highland Park (Loki)"},
    {"tasting_year": 2014, "rank": 3, "name": "Littlemill (21)"},
    {"tasting_year": 2014, "rank": 4, "name": "Mackinlay Shackleton"},
    {"tasting_year": 2014, "rank": 5, "name": "Kilchoman (3)"},
    # 2015
    {"tasting_year": 2015, "rank": 1, "name": "Arbelour (18)"},
    {"tasting_year": 2015, "rank": 2, "name": "Glenmorangie (Signet)"},
    {"tasting_year": 2015, "rank": 3, "name": "Macallan (Ruby)"},
    {"tasting_year": 2015, "rank": 4, "name": "Tomatin (1989)"},
    {"tasting_year": 2015, "rank": 5, "name": "Bunnahabhain (XXV)"},
    # 2016
    {"tasting_year": 2016, "rank": 1, "name": "Dalmore (King Alex III)"},
    {"tasting_year": 2016, "rank": 2, "name": "Mortlach (Rare Old)"},
    {"tasting_year": 2016, "rank": 3, "name": "Lismore (21)"},
    {"tasting_year": 2016, "rank": 4, "name": "Cragganmore (25)"},
    {"tasting_year": 2016, "rank": 5, "name": "Ardbeg (Corryvrechan)"},
    # 2017
    {"tasting_year": 2017, "rank": 1, "name": "Glendronach (21)"},
    {"tasting_year": 2017, "rank": 2, "name": "Bowmore (23)"},
    {"tasting_year": 2017, "rank": 3, "name": "Glenfarclas (12)"},
    {"tasting_year": 2017, "rank": 4, "name": "Talisker (25)"},
    {"tasting_year": 2017, "rank": 5, "name": "Tomintoul (16)"},
    # 2018
    {"tasting_year": 2018, "rank": 1, "name": "Craigellachie (23)"},
    {"tasting_year": 2018, "rank": 2, "name": "Royal Brackla (21)"},
    # Singleton, not Singletone
    {"tasting_year": 2018, "rank": 3, "name": "Glen Ord (Singleton Tailfire)"},
    {"tasting_year": 2018, "rank": 4, "name": "Benriach (1997)"},
    {"tasting_year": 2018, "rank": 5, "name": "Bruichladdich (Black Art)"},
    # 2019
    {"tasting_year": 2019, "rank": 1, "name": "Blair Athol (23)"},
    {"tasting_year": 2019, "rank": 2, "name": "Longmorn (16)"},
    {"tasting_year": 2019, "rank": 3, "name": "Arran (Sauternes)"},
    {"tasting_year": 2019, "rank": 4, "name": "Glenfiddich (Winter Storm)"},
    {"tasting_year": 2019, "rank": 5, "name": "Kilkerran (12)"},
    # 2020 - COVID
    # 2021 - COVID
    # 2022
    {"tasting_year": 2022, "rank": 1, "name": "Tullibardine (20)"},
    {"tasting_year": 2022, "rank": 2, "name": "Aberfeldy (19)"},
    {"tasting_year": 2022, "rank": 3, "name": "Deanston (18)"},
    {"tasting_year": 2022, "rank": 4, "name": "Jura (Tastival 2017)"},
    {"tasting_year": 2022, "rank": 5, "name": "Glen Grant (18)"},
    # 2023
    {"tasting_year": 2023, "rank": 1, "name": "Glenfiddich (23)"},
    {"tasting_year": 2023, "rank": 2, "name": "Bladnoch (Talia)"},
    {"tasting_year": 2023, "rank": 3, "name": "Bunnahabhain (12)"},
    {"tasting_year": 2023, "rank": 4, "name": "Highland Park (15)"},
    {"tasting_year": 2023, "rank": 5, "name": "Ben Nevis (6)"},
]
In [163]:
# Convert the list of dictionaries to a DataFrame.
scotch_yearly_rankings = pl.DataFrame(placement_data)
In [164]:
# Extract more information from the column.
scotch_yearly_rankings = scotch_yearly_rankings.with_columns(
    pl.col("name").str.extract(r"([\.\w\s]+) ", 1).alias("distillery"),
    pl.col("name")
    .str.extract(r"([1-4]?[0-9])", 1)
    .cast(pl.Int32)
    .alias("age_statement"),
    pl.col("name").str.extract(r"(\d{4})", 1).cast(pl.Int32).alias("bottling_year"),
    pl.col("name").str.extract(r"(\(.*\))", 1).alias("metadata"),
    (6 - pl.col("rank")).alias("stars"),
)

Extract distillery info from Wikipedia

We want more information about each distillery for our analysis. Wikipedia includes a table which of existing and past dead Scotch distilleries. It’s not a complete list, but it’s enough to get started with.

In [165]:
from urllib.parse import urljoin

import requests

base_url = "https://en.wikipedia.org/"
path = "/wiki/List_of_whisky_distilleries_in_Scotland"
response = requests.get(urljoin(base_url, path))
In [166]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "html.parser")
distilleries_table = soup.find("table", {"class": "wikitable"})
distilleries_rows = distilleries_table.find("tbody").find_all("tr")

distillery_data = []
for row in distilleries_rows:
    cells = row.find_all("td")
    if not cells:
        continue

    distillery_data.append(
        {
            "distillery": cells[0].text.strip(),
            "location": cells[1].text.strip(),
            "region": cells[2].text.strip(),
            "founded": cells[3].text.strip(),
            "owner": cells[4].text.strip(),
        }
    )
In [167]:
# Some distilleries are missing from the Wikipedia page. For those, I manually searched
# online to find the missing information.
manually_added_distilleries = [
    {
        "distillery": "Old Pulteney",
        "location": "Wick",
        "region": "Highland",
        "founded": "1826",
        "owner": "Inver House Distillers",
    },
    {"distillery": "Poit Dhubh", "location": "Isle of Skye", "region": "Island"},
    {
        "distillery": "Lismore",
        "region": "Speyside",
    },
    {
        "distillery": "Provenance",
        "region": "Unknown",
    },
    {
        "distillery": "Mull Village",
        "region": "Island",
    },
    {
        "distillery": "Mackinlay",
        "region": "Highland",
        "founded": "1847",
    },
    {
        "distillery": "Arbelour",
        "location": "Arbelour",
        "region": "Speyside",
        "founded": "1879",
        "owner": "Chivas Brothers",
    },
    {
        "distillery": "Benriach",
        "location": "Moray",
        "region": "Speyside",
        "founded": "1898",
        "owner": "Brown-Forman",
    },
    {
        "distillery": "Kilkerran",
        "location": "Campbeltown",
        "region": "Campbeltown",
        "founded": "1872",
        "owner": "Mitchell's Glengyle Ltd",
    },
    {
        "distillery": "Aberfeldy",
        "location": "Aberfeldy",
        "region": "Highland",
        "founded": "1896",
        "owner": "Bacardi",
    },
]
In [168]:
dead_distilleries = []

soup = BeautifulSoup(response.text, "html.parser")
dead_distilleries_table = soup.find_all("table", {"class": "wikitable"})[2]
dead_distillery_rows = dead_distilleries_table.find("tbody").find_all("tr")
for row in dead_distillery_rows:
    cells = row.find_all("td")
    if not cells:
        continue

    dead_distilleries.append(
        {
            "distillery": cells[0].text.strip(),
            "location": cells[1].text.strip(),
            "region": cells[2].text.strip(),
            "year_closed": cells[3].text.strip(),
        }
    )
In [169]:
# Combine all the distillery data into a single DataFrame.
distilleries_df = pl.DataFrame(
    distillery_data + dead_distilleries + manually_added_distilleries,
    infer_schema_length=1000,
)

Join distillery info with placemat data

With the distillery data, we can now join it with the data from the placemat.

In [170]:
enhanced_df = scotch_yearly_rankings.join(distilleries_df, on="distillery", how="left")

Start exploring the data

This is the fun part! Used seaborn to create the plots.

In [171]:
import seaborn as sns

sns.set_theme()
In [172]:
region_aggregate = (
    enhanced_df.groupby("region")
    .agg(
        pl.col("distillery").count().alias("count"),
        pl.col("stars").mean().alias("stars_mean"),
        pl.col("stars").sum().alias("stars_total"),
    )
    .sort("stars_mean", descending=True)
)
/var/folders/4l/q3fxbjh106d0tz36zw1smt480000gn/T/ipykernel_83160/1441878289.py:2: DeprecationWarning: `groupby` is deprecated. It has been renamed to `group_by`.
  enhanced_df.groupby("region")
In [173]:
ax = sns.barplot(data=region_aggregate, x="region", y="stars_mean")
ax.set_ylim(0, 5)
ax.set_title("Average stars per region")
for i in ax.containers:
    ax.bar_label(i)

Average stars per Scotch whiskey region.
In [174]:
distillery_aggregate = enhanced_df.group_by("distillery").agg(
    pl.col("stars").mean().alias("stars_mean"),
    pl.col("stars").sum().alias("stars_total"),
    pl.col("stars").count().alias("count"),
)
In [175]:
top_5_distilleries_by_stars_total = distillery_aggregate.sort(
    "stars_total", descending=True
).head(5)

ax = sns.barplot(
    data=top_5_distilleries_by_stars_total, x="distillery", y="stars_total"
)
ax.set_title("Sum of stars for each distillery")
for i in ax.containers:
    ax.bar_label(i)

Number of stars earned for each distillery in descending order.