from plotly import express as px
import plotly.figure_factory as ff
import numpy as np
import pandas as pd
import sqlite3
= sqlite3.connect("temps.db") # connect to database
conn = conn.cursor()
cursor = pd.read_csv("temps_stacked.csv")
df1 #df1.to_sql("temperatures", conn, index=False)
= pd.read_csv("countries.csv")
df2 #df2.to_sql("countries", conn, index=False)
= pd.read_csv("station-metadata.csv")
df3 #df3.to_sql("stations", conn, index=False)
# close connection conn.close()
What else can Plotly produce? Today, we continue with the NOAA climate data and explore what other visualizations can be made.
1. Import Data
This procedure is the exact same as the last post.
2. Maps and More Maps
In the previous post, we mainly worked with px.scatter_mapbox
.
Let’s see what other maps there are!
1. px.density_mapbox
: Mapbox Density Heatmap
# using the stations data
= px.density_mapbox(df3,
fig ='LATITUDE',
lat='LONGITUDE',
lon=2,
radius=0.8,
zoom= 500)
height
="carto-positron")
fig.update_layout(mapbox_style={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(margin fig.show()
Woah! The above produces a heatmap comparing the density of stations. The brighter color (like that of the US) represents that there are more stations clustered in that region. We can play around with the various arguments to get different results.
For example, the following plot will focus on Europe:
= px.density_mapbox(df3,
fig ='LATITUDE',
lat='LONGITUDE',
lon=3.5,
radius=2.5,
zoom= 500,
height ={"lat" : 50, "lon" : 10})
center
="carto-positron")
fig.update_layout(mapbox_style={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(margin fig.show()
2. px.scatter_geo
: Bubble Map
Now, suppose we wanted to know the exact number of stations in each country. We first write a simple function to grab the data we need.
def station_count():
"""
Function to count the number of stations in each country
"""
= sqlite3.connect("temps.db")
conn = f"SELECT C.Name Country, COUNT(S.Name) Count \
cmd FROM stations S \
LEFT JOIN countries C on SUBSTRING(S.id, 1, 2) = C.'FIPS 10-4' GROUP BY Country"
= pd.read_sql_query(cmd, conn)
df
conn.close()return df
= station_count()
count_df count_df
Country | Count | |
---|---|---|
0 | None | 19 |
1 | Afghanistan | 6 |
2 | Albania | 7 |
3 | Algeria | 100 |
4 | American Samoa | 2 |
... | ... | ... |
232 | Wallis and Futuna | 4 |
233 | Western Sahara | 3 |
234 | Yemen | 4 |
235 | Zambia | 19 |
236 | Zimbabwe | 22 |
237 rows × 2 columns
Now that we have the data, let’s get to plotting our bubble map.
= px.scatter_geo(count_df, locations="Country", locationmode="country names",
fig ="Country", size="Count",
hover_name="natural earth", height = 300)
projection={"r":0,"t":0,"l":0,"b":25})
fig.update_layout(margin fig.show()
With the above, we definitely get the information that we wanted, but in terms of visualization, it’s not the best. Most bubbles are barely visible due to their small station count. How can we fix this?
3. px.choropleth
: Choropleth Map
We will essentially run the same code but using px.choropleth
.
= px.choropleth(count_df, locations="Country", locationmode="country names",
fig ="Country", color="Count", height = 300)
hover_name={"r":0,"t":0,"l":0,"b":25})
fig.update_layout(margin fig.show()
Slightly better, but still not great. To fix this issue, we will use one of the most common tricks used in data transformation: taking the logarithm of the response values.
"Transformed_Count"] = np.log(count_df["Count"])
count_df[= px.choropleth(count_df, locations="Country", locationmode="country names",
fig ="Transformed_Count", hover_name="Country",
color="Viridis")
color_continuous_scale={"r":0,"t":0,"l":0,"b":25})
fig.update_layout(margin fig.show()
This is much better. Now that we are working off of an exponential scale, the distinctions are much more clear. To get the station count number, we can simply compute \(e^x\), where \(x\) is the transformed count for that specific country.
4. ff.create_hexbin_mapbox
: Hexbin Mapbox
We return to our old friend Mapbox, and try to replicate the visualization in Part 1 but with a twist.
= df3.drop(1752) # drop an observation that will cause an error
df3_fix
= ff.create_hexbin_mapbox(
fig =df3_fix, lat="LATITUDE", lon="LONGITUDE",
data_frame=10, opacity=0.6, labels={"color": "Point Count"},
nx_hexagon= 0.6, height=500
zoom
)={"r":0,"t":0,"l":0,"b":25})
fig.update_layout(margin="carto-positron")
fig.update_layout(mapbox_style fig.show()
The above is similar to a density heatmap, but it is organized in hexagonal regions. We can play around with the arguments to get a cleaner plot:
= ff.create_hexbin_mapbox(
fig =df3_fix, lat="LATITUDE", lon="LONGITUDE",
data_frame=25, opacity=0.3, labels={"color": "Point Count"},
nx_hexagon=1, zoom=0.6, height=500
min_count
)={"r":0,"t":0,"l":0,"b":25})
fig.update_layout(margin="carto-positron")
fig.update_layout(mapbox_style fig.show()
Nice! The above only shows regions with at least 1 station. Now, suppose a user wanted to know the mean temperature of each region (based on stations that are in that region) for a specific year and month. How could we do that?
We start with writing a query function query_region_temp()
that grabs the appropriate data:
def query_region_temp(year, month):
"""
Query function to filter data
Args:
year: a specific year
month: a specific month of the year
Return:
a dataframe containing all matches
"""
= sqlite3.connect("temps.db")
conn = f"SELECT S.LATITUDE, S.LONGITUDE, T.Year, T.Month, T.Temp \
cmd FROM stations S \
LEFT JOIN countries C on SUBSTRING(S.id, 1, 2) = C.'FIPS 10-4' \
LEFT JOIN temperatures t on S.id = T.id \
WHERE T.Year = {year} AND T.Month = {month}"
= pd.read_sql_query(cmd, conn)
df
conn.close()return df
We make sure this query function is correct:
2000, 6) query_region_temp(
LATITUDE | LONGITUDE | Year | Month | Temp | |
---|---|---|---|---|---|
0 | 57.7667 | 11.8667 | 2000 | 6 | 14.85 |
1 | 25.3330 | 55.5170 | 2000 | 6 | 33.09 |
2 | 25.6170 | 55.9330 | 2000 | 6 | 33.70 |
3 | 25.2550 | 55.3640 | 2000 | 6 | 32.44 |
4 | 24.4300 | 54.4700 | 2000 | 6 | 33.78 |
... | ... | ... | ... | ... | ... |
14287 | -17.8170 | 25.8170 | 2000 | 6 | 17.55 |
14288 | -14.4500 | 28.4670 | 2000 | 6 | 17.35 |
14289 | -17.9170 | 31.1330 | 2000 | 6 | 13.53 |
14290 | -20.1500 | 28.6170 | 2000 | 6 | 14.10 |
14291 | -20.0670 | 30.8670 | 2000 | 6 | 14.36 |
14292 rows × 5 columns
Great! It seems like the query function is correctly grabbing all temperature data that apply to a specific year and month. All that’s left to do is the create the plotting function, region_avg_plot()
:
def region_avg_plot(year, month, **kwargs):
"""
Create a plot containing average temperature of each hexagonal region
Args:
year: a specific year
month: a specific month of the year
**kwargs: additional keyword arguments
Return:
a hexbin_mapbox plot
"""
# pull the correct data using the query function
= query_region_temp(year = year, month = month)
df
= {
months 1: "January", 2: "February", 3: "March", 4: "April",
5: "May", 6: "June", 7: "July", 8: "August",
9: "September", 10: "October", 11: "November", 12: "December"
}
# error-handling: make sure latitude is not -90
= df['LATITUDE'] != -90
handle = df[handle]
df
# create the plot
= ff.create_hexbin_mapbox(
fig =df, lat="LATITUDE", lon="LONGITUDE",
data_frame=25, opacity=0.3, zoom=0.6, height=500,
nx_hexagon="Temp", agg_func=np.mean, color_continuous_scale="Icefire",
color= "Regional Average Temperature (°C) in " + months.get(month) + ", " + str(year),
title **kwargs
)
= 'Temperature (°C)'
fig.layout.coloraxis.colorbar.title =dict(l=50, r=50, t=100, b=50))
fig.update_layout(margin="carto-positron")
fig.update_layout(mapbox_stylereturn fig
Last but not least, let’s test this function:
2000,6) region_avg_plot(