This dataset contains data for 26 years [1990-2015] of flights between various US airports, and metadata about these routes. Taken from the Bureau of Transportation Statistics, United States Department of Transportation.
Let's see what we can make of this!
# Notebook setup: inline plotting plus the libraries used throughout.
%matplotlib inline
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Passenger counts per (ORIGIN, DEST, YEAR) route, with carrier names
# (columns used below: ORIGIN, DEST, YEAR, PASSENGERS, UNIQUE_CARRIER_NAME).
pass_air_data = pd.read_csv('data/passengers.csv')
In the pass_air_data
dataframe we have, for each year and route, the number of passengers flown and the list of airlines that fly that route.
# Peek at the first rows of the route/passenger data.
pass_air_data.head()
# Create a MultiDiGraph from this dataset.
# Each (ORIGIN, DEST, YEAR) row becomes one parallel directed edge, so a
# route carries one edge per year, keyed 0..n-1 in insertion order.
passenger_graph = nx.from_pandas_edgelist(pass_air_data, source='ORIGIN', target='DEST',
edge_attr=['YEAR', 'PASSENGERS', 'UNIQUE_CARRIER_NAME'],
create_using=nx.MultiDiGraph())
# Inspect one parallel edge (key 24) on the JFK -> SFO route.
passenger_graph['JFK']['SFO'][24]
# Time series for JFK -> SFO: one (YEAR, PASSENGERS) pair per parallel edge.
temp = [(i['YEAR'], i['PASSENGERS'])for i in dict(passenger_graph['JFK']['SFO']).values()]
x, y = zip(*temp)
plt.plot(x, y)
plt.show()
Find the busiest route in 1990 and in 2015 according to number of passengers, and plot the time series of number of passengers on these routes.
You can use the DataFrame instead of working with the network. It will be faster ;) [5 mins]
# Boolean mask: rows whose PASSENGERS equals the maximum for their YEAR,
# i.e. the busiest route of each year.
temp = pass_air_data.groupby(['YEAR'])['PASSENGERS'].transform(max) == pass_air_data['PASSENGERS']
# Busiest routes restricted to 1990 and 2015.
pass_air_data[temp][pass_air_data.YEAR.isin([1990, 2015])]
# Time series for LAX-HNL and LAX-SFO — presumably the busiest routes
# found above.
pass_air_data[(pass_air_data['ORIGIN'] == 'LAX') & (pass_air_data['DEST'] == 'HNL')].plot('YEAR', 'PASSENGERS')
pass_air_data[(pass_air_data['ORIGIN'] == 'LAX') & (pass_air_data['DEST'] == 'SFO')].plot('YEAR', 'PASSENGERS')
# Extract year graph from passenger_graph
def year_network(G, year):
    """Build a plain DiGraph holding only *year*'s edges of multigraph *G*.

    Every kept edge gets that year's PASSENGERS count as its 'weight'
    attribute; all other edge attributes are dropped.
    """
    yearly = nx.DiGraph()
    for src, dst, attrs in G.edges(data=True):
        if attrs['YEAR'] == year:
            yearly.add_edge(src, dst, weight=attrs['PASSENGERS'])
    return yearly
# Snapshot of the flight network for 2015: node count and edge count.
pass_2015 = year_network(passenger_graph, 2015)
len(pass_2015)
len(pass_2015.edges())
# Load in the GPS coordinates of all the airports.
# Colon-delimited file with no header; columns are addressed by integer
# position below (1 appears to be the airport code, 14/15 the
# latitude/longitude — confirm against the file format).
lat_long = pd.read_csv('data/GlobalAirportDatabase.txt', delimiter=':', header=None)
# Show only the airports that appear in the 2015 network.
lat_long[lat_long[1].isin(list(pass_2015.nodes()))]
# Map airport code -> (longitude, latitude) plotting position for every
# airport present in the 2015 network.
airports_in_2015 = lat_long[lat_long[1].isin(list(pass_2015.nodes()))]
pos_dict = {row[1]: (row[15], row[14]) for _, row in airports_in_2015.iterrows()}
pos_dict
Using the position dictionary pos_dict,
create a plot of the airports — only the nodes, not the edges.
Use nx.subgraph(Graph, iterable of nodes)
to create the subgraph, and nx.draw_networkx_nodes(G, pos)
to draw the nodes.
# Draw only the airports (nodes) of the 2015 network at their GPS positions.
plt.figure(figsize=(20, 9))
G = nx.subgraph(pass_2015, pos_dict.keys())
nx.draw_networkx_nodes(G, pos=pos_dict, node_size=10, alpha=0.6, node_color='b')
# nx.draw_networkx_edges(G, pos=pos_dict, width=0.1, arrows=False)
plt.show()
# Same airport positions as a plain matplotlib scatter plot.
plt.figure(figsize=(20, 9))
x = [i[0] for i in pos_dict.values()]
y = [i[1] for i in pos_dict.values()]
plt.scatter(x, y)
# NOTE(review): this histogram of degree centralities is drawn into the
# same figure as the scatter above — these look like two separate
# notebook cells fused together; confirm before reusing.
plt.hist(list(nx.degree_centrality(pass_2015).values()))
plt.show()
Let's plot a log log plot to get a better overview of this.
# Degree distribution on a log-log scale: count how many nodes have each
# degree, then plot log2(degree) against log2(frequency).
degree_freq = {}
for degree in dict(nx.degree(pass_2015)).values():
    degree_freq[degree] = degree_freq.get(degree, 0) + 1
x = np.log2(list(degree_freq.keys()))
y = np.log2(list(degree_freq.values()))
plt.scatter(x, y, alpha=0.4)
plt.show()
# Toy example: how PageRank reacts to new edges in a small star graph.
G = nx.DiGraph()
G.add_edge(1, 2, weight=1)
# print(G.edges())
# G[1][2]
# G[2][1]
# G.is_directed()
# type(G)
# Everything points at node 2.
G.add_edges_from([(1, 2), (3, 2), (4, 2), (5, 2), (6, 2), (7, 2)])
nx.draw_circular(G, with_labels=True)
G.in_degree()
nx.pagerank(G)
# Give node 6 an incoming edge and see how the ranking shifts.
G.add_edge(5, 6)
nx.draw_circular(G, with_labels=True)
nx.pagerank(G)
# An outgoing edge from the hub (2 -> 8) passes rank on to node 8.
G.add_edge(2, 8)
nx.draw_circular(G, with_labels=True)
nx.pagerank(G)
# Top-10 airports of 2015 by three unweighted centrality measures.
sorted(nx.pagerank(pass_2015, weight=None).items(), key=lambda x:x[1], reverse=True)[:10]
sorted(nx.betweenness_centrality(pass_2015).items(), key=lambda x:x[1], reverse=True)[0:10]
sorted(nx.degree_centrality(pass_2015).items(), key=lambda x:x[1], reverse=True)[0:10]
'ANC' is the airport code of Anchorage airport, a place in Alaska, and according to pagerank and betweenness centrality it is the most important airport in this network. Isn't that weird? Thoughts?
related blog post: https://toreopsahl.com/2011/08/12/why-anchorage-is-not-that-important-binary-ties-and-sample-selection/
Let's look at the weighted version, i.e., taking into account the number of people flying to these places.
# Same rankings, this time weighted by passenger counts on each edge.
sorted(nx.betweenness_centrality(pass_2015, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
sorted(nx.pagerank(pass_2015, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
We calculate the average shortest path length of this network, it gives us an idea about the number of jumps we need to make around the network to go from one airport to any other airport in this network.
# This raises: average shortest path length is undefined on a
# disconnected graph, and (as shown below) pass_2015 is not connected.
nx.average_shortest_path_length(pass_2015)
Wait, What??? This network is not connected. That seems like a really stupid thing to do.
# List the weakly connected components to find the isolated airports.
list(nx.weakly_connected_components(pass_2015))
pass_air_data[(pass_air_data['YEAR'] == 2015) & (pass_air_data['ORIGIN'] == 'AIK')]
# Drop the disconnected airports, then re-check connectivity.
pass_2015.remove_nodes_from(['SPB', 'SSB', 'AIK'])
nx.is_weakly_connected(pass_2015)
nx.is_strongly_connected(pass_2015)
# Toy example: a directed 3-cycle is strongly connected...
G = nx.DiGraph()
G.add_edge(1, 2)
G.add_edge(2, 3)
G.add_edge(3, 1)
nx.draw(G)
# ...but a dangling edge 3 -> 4, with no path back, breaks that.
G.add_edge(3, 4)
nx.draw(G)
nx.is_strongly_connected(G)
# Inspect the strongly connected components, and look at TSP — an airport
# that appears only as a destination in 2015.
list(nx.strongly_connected_components(pass_2015))
pass_air_data[(pass_air_data['YEAR'] == 2015) & (pass_air_data['DEST'] == 'TSP')]
# nx.strongly_connected_component_subgraphs() was deprecated in NetworkX 2.1
# and removed in 2.4; build the largest strongly connected component
# explicitly instead. .copy() is required because subgraph() returns a
# read-only view, and this graph has edges added to it further down.
largest_scc_nodes = max(nx.strongly_connected_components(pass_2015), key=len)
pass_2015_strong = pass_2015.subgraph(largest_scc_nodes).copy()
len(pass_2015_strong)
nx.average_shortest_path_length(pass_2015_strong)
How can we decrease the avg shortest path length of this network?
Think of an effective way to add new edges to decrease the avg shortest path length. Let's see if we can come up with a nice way to do this, and the one who gets the highest decrease wins!!!
The rules are simple:
[10 mins]
# Greedy attempt to shrink the average shortest path length: from each of
# the 20 highest-degree-centrality airports, add direct edges to up to 25
# other ranked airports it does not already reach directly. Adjacency is
# checked against the live graph, so earlier additions count.
ranked_nodes = sorted(nx.degree_centrality(pass_2015_strong).items(),
                      key=lambda kv: kv[1], reverse=True)
for hub, _ in ranked_nodes[:20]:
    added = 0
    for candidate, _ in ranked_nodes:
        if candidate == hub or candidate in pass_2015_strong.adj[hub]:
            continue
        pass_2015_strong.add_edge(hub, candidate)
        added += 1
        if added == 25:
            break
nx.average_shortest_path_length(pass_2015_strong)
# The JFK -> SFO edge with key 25 (used below as the 2015 edge);
# UNIQUE_CARRIER_NAME is still a stringified list at this point.
passenger_graph['JFK']['SFO'][25]
def str_to_list(a):
    """Turn a stringified list like "['x', 'y']" back into a Python list.

    Drops the first and last characters (the brackets) and splits on
    ', '. The quotes around each element are preserved, so elements come
    back as e.g. "'United Air Lines Inc.'".
    """
    inner = a[1:-1]
    return inner.split(', ')
# Print each carrier on the key-25 JFK -> SFO edge.
for i in str_to_list(passenger_graph['JFK']['SFO'][25]['UNIQUE_CARRIER_NAME']):
    print(i)
%%time
# Parse the stringified carrier list on every parallel edge once, storing
# the result in a new 'airlines' attribute.
for origin, dest in passenger_graph.edges():
    for key in passenger_graph[origin][dest]:
        passenger_graph[origin][dest][key]['airlines'] = str_to_list(passenger_graph[origin][dest][key]['UNIQUE_CARRIER_NAME'])
Play around with the United Airlines network (built from passenger_graph) for the year 2015.
united_network = nx.DiGraph()
# Build United's 2015 network: keep a route when its key-25 record exists
# and lists United Air Lines among the carriers; the edge weight is that
# record's passenger count.
for src, dst in passenger_graph.edges():
    year_edges = passenger_graph[src][dst]
    if 25 in year_edges and "'United Air Lines Inc.'" in year_edges[25]['airlines']:
        united_network.add_edge(src, dst, weight=year_edges[25]['PASSENGERS'])
# Size of United's 2015 network, and its top airports by weighted
# PageRank and by degree centrality.
len(united_network)
len(united_network.edges())
sorted(nx.pagerank(united_network, weight='weight').items(), key=lambda x:x[1], reverse=True)[0:10]
sorted(nx.degree_centrality(united_network).items(), key=lambda x:x[1], reverse=True)[0:10]
We are in New York so what should we do?
Obviously we will make a time series of number of passengers flying out of EWR/JFK/LGA with United Airlines over the years.
There are 2 ways of doing it: use the united_network graph we just built, OR filter the pass_air_data dataframe directly, as below.
pass_air_data[(pass_air_data.ORIGIN == 'EWR') &
(pass_air_data.UNIQUE_CARRIER_NAME.str.contains('United Air Lines Inc.'))
].groupby('YEAR')['PASSENGERS'].sum().plot()
# Yearly United passengers departing JFK.
pass_air_data[(pass_air_data.ORIGIN == 'JFK') &
(pass_air_data.UNIQUE_CARRIER_NAME.str.contains('United Air Lines Inc.'))
].groupby('YEAR')['PASSENGERS'].sum().plot()
# Yearly United passengers departing LGA.
pass_air_data[(pass_air_data.ORIGIN == 'LGA') &
(pass_air_data.UNIQUE_CARRIER_NAME.str.contains('United Air Lines Inc.'))
].groupby('YEAR')['PASSENGERS'].sum().plot()