Show the code
import requests
from bs4 import BeautifulSoup
import pandas as pd
print("All packages uploaded successfully!")
## All packages uploaded successfully!
I have a keen interest in coffee, so wanted to complete a project that focused on some of the strongest coffee in the world - Vietnamese! It’s a simple project but it demonstrates skills in JavaScript and how to show data in tables more effectively.
Coffee is one of the highest volume traded commodities around the world, particularly in developing countries in Africa, South America and Asia. Analysing coffee export data is crucial for policymakers and manufacturers to be aware of trends in the global industry and monitor market trends, forecast demand and make informed business decisions.
Web scraping is an efficient method to collect data from different sources other than traditional APIs or databases, particularly as data and trends are regularly shared on websites for public consumptions. Web scraping on industry publications is a good example of leveraging these techniques to collect crucial insights for agricultural sectors.
This project will use data from the-shiv - a website that analyses trends in Vietnam’s economy (particularly sectors focussed on import/export). We use Python libraries such as requests
and BeautifulSoup
to extract the information, and pandas
to prepare and clean the data.
import requests
from bs4 import BeautifulSoup
import pandas as pd
print("All packages uploaded successfully!")
## All packages uploaded successfully!
Below is the code to scrape the-shiv
website by first collecting data from all tables available (i.e. the tr
html tag):
# Define the URL for the organic farms directory (example URL)
= 'https://the-shiv.com/vietnams-coffee-industry/'
url
= requests.get(url)
response = BeautifulSoup(response.text, 'html.parser')
soup = soup.find_all('table')
tables print(f"Found {len(tables)} table(s) on the page.")
## Found 4 table(s) on the page.
= []
df_list
# Loop through each table and extract the data
for i, table in enumerate(tables):
= []
rows # Find all table rows
for row in table.find_all('tr'):
# For each row, find both header and data cells
= row.find_all(['th', 'td'])
cols # Get text from each cell and strip extra whitespace
= [col.get_text(strip=True) for col in cols]
cols if cols: # Only add rows that have data
rows.append(cols)
# Create a DataFrame from the extracted data
= pd.DataFrame(rows)
df
# Optional: If the first row is a header, set it as the DataFrame header
if not df.empty:
0,0] = 'Country'
df.iloc[= df.iloc[0,:] # set first row as header
df.columns = df[1:].reset_index(drop=True) # remove the header row from the data
df
df_list.append(df)
Next, we will append a two letter country code that is used on the flagcdn
website to reference the country’s flag icon (see tables below). We also clean the data to convert large numbers with commas into float
data types. This data is then exported to json files, to be read into Observable JS
.
# Get Flag icon reference data
import urllib.request, json
with urllib.request.urlopen('https://flagcdn.com/en/codes.json') as url:
= json.load(url)
icon_df
= pd.DataFrame(list(icon_df.items()), columns=['code', 'Country'])
merge_df = merge_df.astype(str).drop_duplicates(subset = ['Country'])
merge_df
def prepare_data(data):
# Pull table data from list
= data.reset_index(drop = True)
data 'Country'] = data['Country'].astype(str)
data['Country'] = data['Country'].str.replace('USA','United States')
data['Country'] = data['Country'].str.replace('UK','United Kingdom')
data[# data[['2023','2024']] = data[['2023','2024']].map(lambda x: float(x.replace(',','')) / 1000)
= data.drop_duplicates(subset = ['Country'])
data
# Merge data
= pd.merge(data, merge_df, on='Country', how='left')
df
return df
# Save dollars .json file
= prepare_data(df_list[1])
df "data-dol.json", orient="records", index = 2)
df.to_json(
# Save volume .json file
= prepare_data(df_list[3])
df "data-vol.json", orient="records", index = 2) df.to_json(
Now, the json files are read into Observable JS
and tables are created. You can click the header of each variable in the tables to sort the data both ascending and descending.
function createSortableTable(data) {
// If data is not an array, assume it's an object and convert to array.
if (!Array.isArray(data)) {
= Object.values(data);
data
}
// Separate out the Total row (if it exists) and the other rows.
let totalRow = data.find(d => String(d.Country).toLowerCase() === "total");
let otherData = data.filter(d => String(d.Country).toLowerCase() !== "total");
// Mapping between header names and data keys.
const dataKeyMapping = {
"Country": "Country",
"2023": "2023",
"2024": "2024",
"Change": "Change"
;
}
// Helper function to clean up numeric strings.
const cleanNumber = val => {
if (typeof val === "string") {
let cleaned = val.replace(/[,]/g, "").trim();
= cleaned.replace(/[%$]/g, "");
cleaned return parseFloat(cleaned);
}return val;
;
}
// New helper function to format numbers with commas and two decimals.
function formatNumber(val) {
const num = parseFloat(val);
return isNaN(num)
? val
: num.toLocaleString('en-US', {
minimumFractionDigits: 2,
maximumFractionDigits: 2
;
})
}
// Create a container for the table.
const container = html`<div style="font-family: sans-serif;"></div>`;
// Create the table element.
const table = html`<table border="1" cellpadding="5" cellspacing="0"
style="border-collapse: collapse; width: 100%;"></table>`;
// Build table header with clickable sorting functionality.
const thead = table.createTHead();
const headerRow = thead.insertRow();
const columns = ["Country", "2023", "2024", "Change"];
// Track current sort column and order.
let sortColumn = null;
let sortAsc = true;
const headerCells = [];
.forEach(col => {
columnsconst th = document.createElement("th");
.style.padding = "8px";
th.style.border = "1px solid #ccc";
th.style.backgroundColor = "#f9f9f9";
th.style.textAlign = "left";
th.style.cursor = "pointer";
th
const colText = document.createTextNode(col);
.appendChild(colText);
th
// Add a span for the sort icon.
const iconSpan = document.createElement("span");
.style.marginLeft = "5px";
iconSpan.textContent = "⇅"; // default unsorted icon
iconSpan.appendChild(iconSpan);
th
.addEventListener("click", () => {
thif (sortColumn === col) {
= !sortAsc;
sortAsc else {
} = col;
sortColumn = true;
sortAsc
}renderTableBody();
updateHeaderIcons();
;
}).appendChild(th);
headerRow.push({ col, iconSpan });
headerCells;
})
// Create table body element.
const tbody = table.createTBody();
function updateHeaderIcons() {
.forEach(({ col, iconSpan }) => {
headerCellsif (sortColumn === col) {
.textContent = sortAsc ? "↑" : "↓";
iconSpanelse {
} .textContent = "⇅";
iconSpan
};
})
}
function renderTableBody() {
.innerHTML = "";
tbody
// Make a copy of non-Total rows.
let sortedData = [...otherData];
if (sortColumn) {
.sort((a, b) => {
sortedDatalet key = dataKeyMapping[sortColumn];
if (sortColumn === "2023" || sortColumn === "2024") {
const aNum = cleanNumber(a[key]);
const bNum = cleanNumber(b[key]);
return sortAsc ? aNum - bNum : bNum - aNum;
else {
} let aVal = a[key];
let bVal = b[key];
if (!isNaN(parseFloat(aVal)) && !isNaN(parseFloat(bVal))) {
= parseFloat(aVal);
aVal = parseFloat(bVal);
bVal else {
} = String(aVal).toLowerCase();
aVal = String(bVal).toLowerCase();
bVal
}if (aVal < bVal) return sortAsc ? -1 : 1;
if (aVal > bVal) return sortAsc ? 1 : -1;
return 0;
};
})
}
// Render the Total row first (anchored at the top and bolded).
if (totalRow) {
const row = tbody.insertRow();
.style.fontWeight = "bold";
row
// Country cell with flag image if available.
const countryCell = row.insertCell();
.style.padding = "8px";
countryCell.style.border = "1px solid #ccc";
countryCell.style.backgroundColor = "#fffff4";
countryCelllet flag = "";
if (totalRow.code) {
= `<img src="https://flagcdn.com/24x18/${totalRow.code}.png" alt="${totalRow.Country} flag"
flag style="vertical-align: middle; margin-right: 5px;">`;
}.innerHTML = `${flag}${totalRow.Country}`;
countryCell
// 2023 cell.
const cell2023 = row.insertCell();
.style.padding = "8px";
cell2023.style.border = "1px solid #ccc";
cell2023.style.backgroundColor = "#e0f7fa";
cell2023.textContent = formatNumber(totalRow["2023"]);
cell2023
// 2024 cell.
const cell2024 = row.insertCell();
.style.padding = "8px";
cell2024.style.border = "1px solid #ccc";
cell2024.style.backgroundColor = "#e8f5e9";
cell2024.textContent = formatNumber(totalRow["2024"]);
cell2024
// Change cell.
const changeCell = row.insertCell();
.style.padding = "8px";
changeCell.style.border = "1px solid #ccc";
changeCell.style.textAlign = "right";
changeCell.style.backgroundColor = "#fffff4";
changeCell.style.color = totalRow.Change >= 0 ? "green" : "red";
changeCell.textContent = totalRow.Change;
changeCell
}
// Render the remaining rows.
.forEach(d => {
sortedDataconst row = tbody.insertRow();
// Country cell with flag image if available.
const countryCell = row.insertCell();
.style.padding = "8px";
countryCell.style.border = "1px solid #ccc";
countryCell.style.backgroundColor = "#fffff4";
countryCelllet flag = "";
if (d.code) {
= `<img src="https://flagcdn.com/24x18/${d.code}.png" alt="${d.Country} flag"
flag style="vertical-align: middle; margin-right: 5px;">`;
}.innerHTML = `${flag}${d.Country}`;
countryCell
// 2023 cell.
const cell2023 = row.insertCell();
.style.padding = "8px";
cell2023.style.border = "1px solid #ccc";
cell2023.style.backgroundColor = "#e0f7fa";
cell2023.textContent = formatNumber(d["2023"]);
cell2023
// 2024 cell.
const cell2024 = row.insertCell();
.style.padding = "8px";
cell2024.style.border = "1px solid #ccc";
cell2024.style.backgroundColor = "#e8f5e9";
cell2024.textContent = formatNumber(d["2024"]);
cell2024
// Change cell.
const changeCell = row.insertCell();
.style.padding = "8px";
changeCell.style.border = "1px solid #ccc";
changeCell.style.backgroundColor = "#fffff4";
changeCell.style.textAlign = "right";
changeCell.style.color = d.Change >= 0 ? "green" : "red";
changeCell.textContent = d.Change;
changeCell;
})
}
// Initially render the table body.
renderTableBody();
.appendChild(table);
containerreturn container;
}
{// Read the two JSON files.
const data1 = await FileAttachment("data-dol.json").json();
const data2 = await FileAttachment("data-vol.json").json();
// Create sortable tables for each dataset using the reusable function.
const table1 = createSortableTable(data1);
const table2 = createSortableTable(data2);
// Create a container element that displays both tables with a heading.
const container = html`
<div style="font-family: sans-serif;">
<h2>Table 1: Vietnam coffee exports ($US 000s)</h2>
${table1}
<h2>Table 1: Vietnam coffee exports (tons 000s)</h2>
${table2}
</div>
`;
return container;
}