Back to Article
Scotch Whiskey Data Exploration Notebook
Download Notebook

Scotch Whiskey Data Exploration Notebook

In [1]:
import pandas as pd
from typing import Any

Extract data from placemat

I manually input the data from the placemat into the list of dictionaries below. Where I made edits I also commented why the edits were made.

In [2]:
placement_data: list[dict[str, Any]] = [
    # 1998
    {"tasting_year": 1998, "rank": 1, "name": "Macallan (12)"},
    {"tasting_year": 1998, "rank": 2, "name": "Glenrothes (15)"},
    {"tasting_year": 1998, "rank": 3, "name": "Bowmore (15)"},
    {"tasting_year": 1998, "rank": 4, "name": "Glenkinchie (10)"},
    {"tasting_year": 1998, "rank": 5, "name": "Talisker (10)"},
    # 1999
    {"tasting_year": 1999, "rank": 1, "name": "Glenmorangie (18)"},
    {"tasting_year": 1999, "rank": 2, "name": "Macallan (18)"},
    {"tasting_year": 1999, "rank": 3, "name": "Tobermory (Ledaig 1974)"},
    {"tasting_year": 1999, "rank": 4, "name": "Scapa (12)"},
    {"tasting_year": 1999, "rank": 5, "name": "Bunnahabhain (12)"},
    # 2000
    {"tasting_year": 2000, "rank": 1, "name": "Glenmorangie (1971)"},
    {"tasting_year": 2000, "rank": 2, "name": "Highland Park (12)"},
    {"tasting_year": 2000, "rank": 3, "name": "Balvenie (15)"},
    {"tasting_year": 2000, "rank": 4, "name": "Dalwhinnie (15)"},
    # Assume Bowmaore was a typo and should be Bowmore.
    {"tasting_year": 2000, "rank": 5, "name": "Bowmore (12)"},
    # 2001
    {"tasting_year": 2001, "rank": 1, "name": "Macallan (25)"},
    {"tasting_year": 2001, "rank": 2, "name": "Glengoyne (17)"},
    {"tasting_year": 2001, "rank": 3, "name": "Knockando (12)"},
    {"tasting_year": 2001, "rank": 4, "name": "Oban (14)"},
    {"tasting_year": 2001, "rank": 5, "name": "Talisker (10)"},
    # 2002
    {"tasting_year": 2002, "rank": 1, "name": "Highland Park (1977)"},
    {"tasting_year": 2002, "rank": 2, "name": "Coleburn (21)"},
    {"tasting_year": 2002, "rank": 3, "name": "Glenglassaugh (1974)"},
    {"tasting_year": 2002, "rank": 4, "name": "Lagavulin (16)"},
    {"tasting_year": 2002, "rank": 5, "name": "Glenkinchie (Distiller's Edition)"},
    # 2003
    {"tasting_year": 2003, "rank": 1, "name": "Glenfiddich (21)"},
    {"tasting_year": 2003, "rank": 2, "name": "Teaninich (21)"},
    {"tasting_year": 2003, "rank": 3, "name": "St. Magdalene (25)"},
    {"tasting_year": 2003, "rank": 4, "name": "Ardbeg (17)"},
    {"tasting_year": 2003, "rank": 5, "name": "Linkwood (25)"},
    # 2004
    {"tasting_year": 2004, "rank": 1, "name": "Balvenie (25)"},
    {"tasting_year": 2004, "rank": 2, "name": "Macallan (1841 Replica)"},
    {"tasting_year": 2004, "rank": 3, "name": "Provenance (10)"},
    {"tasting_year": 2004, "rank": 4, "name": "Bowmore (10)"},
    {"tasting_year": 2004, "rank": 5, "name": "Old Pulteney (12)"},
    # 2005
    {"tasting_year": 2005, "rank": 1, "name": "Highland Park (25)"},
    {"tasting_year": 2005, "rank": 2, "name": "Glen Ord (12)"},
    {"tasting_year": 2005, "rank": 3, "name": "Inverleven (1986)"},
    {"tasting_year": 2005, "rank": 4, "name": "Tobermory (Mull Village 10)"},
    {"tasting_year": 2005, "rank": 5, "name": "Caol Ila (Cask Strength)"},
    # 2006
    {"tasting_year": 2006, "rank": 1, "name": "Poit Dhubh (12)"},
    {"tasting_year": 2006, "rank": 2, "name": "Balblair (16)"},
    {"tasting_year": 2006, "rank": 3, "name": "Glenburgie (Glencraig 29)"},
    {"tasting_year": 2006, "rank": 4, "name": "Auchentoshan (21)"},
    {"tasting_year": 2006, "rank": 5, "name": "Fettercairn (13)"},
    # 2007
    {"tasting_year": 2007, "rank": 1, "name": "Macallan (35)"},
    {"tasting_year": 2007, "rank": 2, "name": "Jura (Superstition ?)"},
    {"tasting_year": 2007, "rank": 3, "name": "Highland Park (30)"},
    {"tasting_year": 2007, "rank": 4, "name": "Glen Ord (25)"},
    {"tasting_year": 2007, "rank": 5, "name": "St. Magdalene (1975)"},
    # 2008
    {"tasting_year": 2008, "rank": 1, "name": "Glenfarclas (30)"},
    {"tasting_year": 2008, "rank": 2, "name": "Scapa (26)"},
    {"tasting_year": 2008, "rank": 3, "name": "Pittyvaich (1976)"},
    {"tasting_year": 2008, "rank": 4, "name": "Auchentoshan (18)"},
    {"tasting_year": 2008, "rank": 5, "name": "Arran (?)"},
    # 2009
    {"tasting_year": 2009, "rank": 1, "name": "Glenmorangie (25)"},
    # TODO: Is Glenrothes and Glenrothes the same?
    {"tasting_year": 2009, "rank": 2, "name": "Glenrothes (18)"},
    {"tasting_year": 2009, "rank": 3, "name": "Brora (24)"},
    {"tasting_year": 2009, "rank": 4, "name": "Glenfarclas (Blairfindy 31)"},
    {"tasting_year": 2009, "rank": 5, "name": "Arran (10 Burns' 250th Anniversary)"},
    # 2010
    {"tasting_year": 2010, "rank": 1, "name": "Glenlivet (XXV)"},
    {"tasting_year": 2010, "rank": 2, "name": "Tamdhu (18)"},
    {"tasting_year": 2010, "rank": 3, "name": "Rosebank (18)"},
    {"tasting_year": 2010, "rank": 4, "name": "Port Ellen (1982)"},
    {"tasting_year": 2010, "rank": 5, "name": "Old Pulteney (17)"},
    # 2011
    {"tasting_year": 2011, "rank": 1, "name": "Bruichladdich (16)"},
    {"tasting_year": 2011, "rank": 2, "name": "Glen Ord (Singleton 12)"},
    {"tasting_year": 2011, "rank": 3, "name": "Glenfarclas (40)"},
    {"tasting_year": 2011, "rank": 4, "name": "Scapa (16)"},
    {"tasting_year": 2011, "rank": 5, "name": "Bladnoch (1992)"},
    # 2012
    {"tasting_year": 2012, "rank": 1, "name": "Macallan (1969)"},
    {"tasting_year": 2012, "rank": 2, "name": "Balblair (1989)"},
    {"tasting_year": 2012, "rank": 3, "name": "Balvenie (21)"},
    {"tasting_year": 2012, "rank": 4, "name": "Highland Park (12)"},
    {"tasting_year": 2012, "rank": 5, "name": "Tobermory (15)"},
    # 2013
    {"tasting_year": 2013, "rank": 1, "name": "Glengoyne (21)"},
    {"tasting_year": 2013, "rank": 2, "name": "Glen Keith (1968)"},
    {"tasting_year": 2013, "rank": 3, "name": "Glen Grant (1974)"},
    {"tasting_year": 2013, "rank": 4, "name": "Jura (Prophency ?)"},
    {"tasting_year": 2013, "rank": 5, "name": "Linkwood (12)"},
    # 2014
    {"tasting_year": 2014, "rank": 1, "name": "Glenfiddich (30)"},
    {"tasting_year": 2014, "rank": 2, "name": "Highland Park (Loki)"},
    {"tasting_year": 2014, "rank": 3, "name": "Littlemill (21)"},
    {"tasting_year": 2014, "rank": 4, "name": "Mackinlay Shackleton"},
    {"tasting_year": 2014, "rank": 5, "name": "Kilchoman (3)"},
    # 2015
    {"tasting_year": 2015, "rank": 1, "name": "Arbelour (18)"},
    {"tasting_year": 2015, "rank": 2, "name": "Glenmorangie (Signet)"},
    {"tasting_year": 2015, "rank": 3, "name": "Macallan (Ruby)"},
    {"tasting_year": 2015, "rank": 4, "name": "Tomatin (1989)"},
    {"tasting_year": 2015, "rank": 5, "name": "Bunnahabhain (XXV)"},
    # 2016
    {"tasting_year": 2016, "rank": 1, "name": "Dalmore (King Alex III)"},
    {"tasting_year": 2016, "rank": 2, "name": "Mortlach (Rare Old)"},
    {"tasting_year": 2016, "rank": 3, "name": "Lismore (21)"},
    {"tasting_year": 2016, "rank": 4, "name": "Cragganmore (25)"},
    {"tasting_year": 2016, "rank": 5, "name": "Ardbeg (Corryvrechan)"},
    # 2017
    {"tasting_year": 2017, "rank": 1, "name": "Glendronach (21)"},
    {"tasting_year": 2017, "rank": 2, "name": "Bowmore (23)"},
    {"tasting_year": 2017, "rank": 3, "name": "Glenfarclas (12)"},
    {"tasting_year": 2017, "rank": 4, "name": "Talisker (25)"},
    {"tasting_year": 2017, "rank": 5, "name": "Tomintoul (16)"},
    # 2018
    {"tasting_year": 2018, "rank": 1, "name": "Craigellachie (23)"},
    {"tasting_year": 2018, "rank": 2, "name": "Royal Brackla (21)"},
    # Singleton, not Singletone
    {"tasting_year": 2018, "rank": 3, "name": "Glen Ord (Singleton Tailfire)"},
    {"tasting_year": 2018, "rank": 4, "name": "Benriach (1997)"},
    {"tasting_year": 2018, "rank": 5, "name": "Bruichladdich (Black Art)"},
    # 2019
    {"tasting_year": 2019, "rank": 1, "name": "Blair Athol (23)"},
    {"tasting_year": 2019, "rank": 2, "name": "Longmorn (16)"},
    {"tasting_year": 2019, "rank": 3, "name": "Arran (Sauternes)"},
    {"tasting_year": 2019, "rank": 4, "name": "Glenfiddich (Winter Storm)"},
    {"tasting_year": 2019, "rank": 5, "name": "Kilkerran (12)"},
    # 2020 - COVID
    # 2021 - COVID
    # 2022
    {"tasting_year": 2022, "rank": 1, "name": "Tullibardine (20)"},
    {"tasting_year": 2022, "rank": 2, "name": "Aberfeldy (19)"},
    {"tasting_year": 2022, "rank": 3, "name": "Deanston (18)"},
    {"tasting_year": 2022, "rank": 4, "name": "Jura (Tastival 2017)"},
    {"tasting_year": 2022, "rank": 5, "name": "Glen Grant (18)"},
    # 2023
    {"tasting_year": 2023, "rank": 1, "name": "Glenfiddich (23)"},
    {"tasting_year": 2023, "rank": 2, "name": "Bladnoch (Talia)"},
    {"tasting_year": 2023, "rank": 3, "name": "Bunnahabhain (12)"},
    {"tasting_year": 2023, "rank": 4, "name": "Highland Park (15)"},
    {"tasting_year": 2023, "rank": 5, "name": "Ben Nevis (6)"},
]
In [3]:
# Convert the list of dictionaries to a DataFrame.
scotch_yearly_rankings = pd.DataFrame(placement_data)
In [4]:
# Extract more information from the column.
scotch_yearly_rankings["distillery"] = scotch_yearly_rankings["name"].str.extract(
    r"([\.\w\s]+) ", 1
)
scotch_yearly_rankings["age_statement"] = (
    scotch_yearly_rankings["name"].str.extract(r"([1-4]?[0-9])", 1).astype("Int64")
)
scotch_yearly_rankings["bottling_year"] = (
    scotch_yearly_rankings["name"].str.extract(r"(\d{4})", 1).astype("Int64")
)
scotch_yearly_rankings["metadata"] = scotch_yearly_rankings["name"].str.extract(
    r"(\(.*\))", 1
)
scotch_yearly_rankings["stars"] = 6 - scotch_yearly_rankings["rank"]
scotch_yearly_rankings
tasting_year rank name distillery age_statement bottling_year metadata stars
0 1998 1 Macallan (12) Macallan 12 <NA> (12) 5
1 1998 2 Glenrothes (15) Glenrothes 15 <NA> (15) 4
2 1998 3 Bowmore (15) Bowmore 15 <NA> (15) 3
3 1998 4 Glenkinchie (10) Glenkinchie 10 <NA> (10) 2
4 1998 5 Talisker (10) Talisker 10 <NA> (10) 1
... ... ... ... ... ... ... ... ...
115 2023 1 Glenfiddich (23) Glenfiddich 23 <NA> (23) 5
116 2023 2 Bladnoch (Talia) Bladnoch <NA> <NA> (Talia) 4
117 2023 3 Bunnahabhain (12) Bunnahabhain 12 <NA> (12) 3
118 2023 4 Highland Park (15) Highland Park 15 <NA> (15) 2
119 2023 5 Ben Nevis (6) Ben Nevis 6 <NA> (6) 1

120 rows × 8 columns

Extract distillery info from Wikipedia

We want more information about each distillery for our analysis. Wikipedia includes a table which of existing and past dead Scotch distilleries. It’s not a complete list, but it’s enough to get started with.

In [5]:
from urllib.parse import urljoin

import requests

base_url = "https://en.wikipedia.org/"
path = "/wiki/List_of_whisky_distilleries_in_Scotland"
headers = {
    "User-Agent": f"Bot/0.0 (https://lennoxstevenson.com lennox.stevenson@gmail.com) requests/{requests.__version__}"
}
response = requests.get(urljoin(base_url, path), headers=headers)
In [6]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "html.parser")
distilleries_table = soup.find("table", {"class": "wikitable"})
distilleries_rows = distilleries_table.find("tbody").find_all("tr")

distillery_data: list[dict[str, Any]] = []
for row in distilleries_rows:
    cells = row.find_all("td")
    if not cells:
        continue

    distillery_data.append(
        {
            "distillery": cells[0].text.strip(),
            "location": cells[1].text.strip(),
            "region": cells[2].text.strip(),
            "founded": cells[3].text.strip(),
            "owner": cells[4].text.strip(),
        }
    )
In [7]:
# Some distilleries are missing from the Wikipedia page. For those, I manually searched
# online to find the missing information.
manually_added_distilleries = [
    {
        "distillery": "Old Pulteney",
        "location": "Wick",
        "region": "Highland",
        "founded": "1826",
        "owner": "Inver House Distillers",
    },
    {"distillery": "Poit Dhubh", "location": "Isle of Skye", "region": "Island"},
    {
        "distillery": "Lismore",
        "region": "Speyside",
    },
    {
        "distillery": "Provenance",
        "region": "Unknown",
    },
    {
        "distillery": "Mull Village",
        "region": "Island",
    },
    {
        "distillery": "Mackinlay",
        "region": "Highland",
        "founded": "1847",
    },
    {
        "distillery": "Arbelour",
        "location": "Arbelour",
        "region": "Speyside",
        "founded": "1879",
        "owner": "Chivas Brothers",
    },
    {
        "distillery": "Benriach",
        "location": "Moray",
        "region": "Speyside",
        "founded": "1898",
        "owner": "Brown-Forman",
    },
    {
        "distillery": "Kilkerran",
        "location": "Campbeltown",
        "region": "Campbeltown",
        "founded": "1872",
        "owner": "Mitchell's Glengyle Ltd",
    },
    {
        "distillery": "Aberfeldy",
        "location": "Aberfeldy",
        "region": "Highland",
        "founded": "1896",
        "owner": "Bacardi",
    },
]
In [8]:
dead_distilleries: list[dict[str, Any]] = []

soup = BeautifulSoup(response.text, "html.parser")
dead_distilleries_table = soup.find_all("table", {"class": "wikitable"})[2]
dead_distillery_rows = dead_distilleries_table.find("tbody").find_all("tr")
for row in dead_distillery_rows:
    cells = row.find_all("td")
    if not cells:
        continue

    dead_distilleries.append(
        {
            "distillery": cells[0].text.strip(),
            "location": cells[1].text.strip(),
            "region": cells[2].text.strip(),
            "year_closed": cells[3].text.strip(),
        }
    )
In [9]:
# Combine all the distillery data into a single DataFrame.
distilleries_df = pd.DataFrame(
    distillery_data + dead_distilleries + manually_added_distilleries
)
distilleries_df
distillery location region founded owner year_closed
0 Aberargie Aberargie Lowland 2017 Perth Distilling Co NaN
1 Aberfeldy Aberfeldy Highland 1896 John Dewar & Sons NaN
2 Aberlour Aberlour Speyside 1879 Chivas Brothers NaN
3 Abhainn Dearg Uig, Isle of Lewis Island 2008 Mark Tayburn NaN
4 Ailsa Bay Girvan Lowland 2009 William Grant & Sons NaN
... ... ... ... ... ... ...
247 Mackinlay NaN Highland 1847 NaN NaN
248 Arbelour Arbelour Speyside 1879 Chivas Brothers NaN
249 Benriach Moray Speyside 1898 Brown-Forman NaN
250 Kilkerran Campbeltown Campbeltown 1872 Mitchell's Glengyle Ltd NaN
251 Aberfeldy Aberfeldy Highland 1896 Bacardi NaN

252 rows × 6 columns

Join distillery info with placemat data

With the distillery data, we can now join it with the data from the placemat.

In [10]:
enhanced_df = scotch_yearly_rankings.merge(distilleries_df, on="distillery", how="left")
enhanced_df
tasting_year rank name distillery age_statement bottling_year metadata stars location region founded owner year_closed
0 1998 1 Macallan (12) Macallan 12 <NA> (12) 5 Craigellachie Speyside 1824 Edrington NaN
1 1998 2 Glenrothes (15) Glenrothes 15 <NA> (15) 4 Rothes Speyside 1879 Edrington NaN
2 1998 3 Bowmore (15) Bowmore 15 <NA> (15) 3 Bowmore Islay 1779 Suntory Global Spirits NaN
3 1998 4 Glenkinchie (10) Glenkinchie 10 <NA> (10) 2 Pencaitland Lowland 1837 Diageo NaN
4 1998 5 Talisker (10) Talisker 10 <NA> (10) 1 Carbost, Isle of Skye Island 1830 Diageo NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
116 2023 1 Glenfiddich (23) Glenfiddich 23 <NA> (23) 5 Dufftown Speyside 1886 William Grant & Sons NaN
117 2023 2 Bladnoch (Talia) Bladnoch <NA> <NA> (Talia) 4 Wigtown Lowland 1817 David Prior NaN
118 2023 3 Bunnahabhain (12) Bunnahabhain 12 <NA> (12) 3 Port Askaig Islay 1881 Distell NaN
119 2023 4 Highland Park (15) Highland Park 15 <NA> (15) 2 Kirkwall Island 1798 Edrington NaN
120 2023 5 Ben Nevis (6) Ben Nevis 6 <NA> (6) 1 Fort William Highland 1825 Nikka Whisky Distilling Co Ltd NaN

121 rows × 13 columns

Start exploring the data

This is the fun part! Used seaborn to create the plots.

In [11]:
import seaborn as sns

sns.set_theme()
In [12]:
region_aggregate = (
    enhanced_df.groupby("region", as_index=False)
    .agg(
        count=("distillery", "count"),
        stars_mean=("stars", "mean"),
        stars_total=("stars", "sum"),
    )
    .sort_values("stars_mean", ascending=False)
)
region_aggregate
region count stars_mean stars_total
5 Speyside 39 3.538462 138
1 Highland 31 3.419355 106
6 Unknown 1 3.000000 3
2 Island 20 2.800000 56
4 Lowland 11 2.272727 25
3 Islay 15 2.000000 30
0 Campbeltown 1 1.000000 1
In [13]:
ax = sns.barplot(data=region_aggregate, x="region", y="stars_mean")
ax.set_ylim(0, 5)
ax.set_title("Average stars per region")
for i in ax.containers:
    ax.bar_label(i)

Average stars per Scotch whiskey region.
In [14]:
distillery_aggregate = enhanced_df.groupby("distillery", as_index=False).agg(
    stars_mean=("stars", "mean"),
    stars_total=("stars", "sum"),
    count=("stars", "count"),
)
distillery_aggregate
distillery stars_mean stars_total count
0 Aberfeldy 4.000000 8 2
1 Arbelour 5.000000 5 1
2 Ardbeg 1.500000 3 2
3 Arran 1.666667 5 3
4 Auchentoshan 2.000000 4 2
... ... ... ... ...
62 Teaninich 4.000000 4 1
63 Tobermory 2.000000 6 3
64 Tomatin 2.000000 2 1
65 Tomintoul 1.000000 1 1
66 Tullibardine 5.000000 5 1

67 rows × 4 columns

In [15]:
top_5_distilleries_by_stars_total = distillery_aggregate.sort_values(
    "stars_total", ascending=False
).head(5)

ax = sns.barplot(
    data=top_5_distilleries_by_stars_total, x="distillery", y="stars_total"
)
ax.set_title("Sum of stars for each distillery")
for i in ax.containers:
    ax.bar_label(i)

Number of stars earned for each distillery in descending order.