#!/usr/bin/env python
# coding: utf-8

# # Step - 3: Legacy of Slavery Certificate of Freedom Case Study - Data Visualization

# In[19]:

# Libraries used for the visualizations in this notebook.
# Standard library (needed for the Geo Map visualization below).
import json
from urllib.request import urlopen

# Data handling, network analysis and geo helpers.
import geopandas as gpd
import matplotlib.pyplot as plt
import networkx
import numpy as np
import pandas as pd
import shapely as shp

# Plotly (offline / notebook mode).
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Bokeh is used for the interactive network visualization below.
from bokeh.io import output_notebook, save, show
from bokeh.models import Circle, ColumnDataSource, MultiLine, Range1d
from bokeh.plotting import figure, from_networkx

# Word cloud library.
from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud

init_notebook_mode(connected=True)

# County boundaries (GeoJSON keyed by FIPS code) used by the choropleth maps.
COUNTIES_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(COUNTIES_GEOJSON_URL) as response:
    counties = json.load(response)


# In[20]:

# Re-import the cleaned CSV saved by the previous step (Step 2).
df = pd.read_csv("Datasets/LoS_Clean_Output_Mod2.csv")
df.head(10)


# In[29]:

# A pandas Series/DataFrame has no iplot method on its own; cufflinks links
# plotly to pandas and adds it.
import cufflinks as cf

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)


# In[30]:

# Simple histogram of age using iplot.
df['AgeFormatted'].iplot(kind='hist', title='Histogram Chart of Enslaved Age vs Count')


# In[31]:

# Simple histogram of height using iplot.
df['Height_Inches'].iplot(kind='hist', title='Histogram Chart of Enslaved Height vs Count')


# In[32]:

# Pie chart of the distribution of Certificates of Freedom (CoF) by Sex.
fig = px.pie(
    df,
    names='Sex',
    color_discrete_sequence=px.colors.sequential.RdBu,
    title="Pie Chart of Distribution of CoF over Sex",
)
fig.show()


# In[33]:

# Using Plotly Express (docs: https://plotly.com/python/plotly-express/), draw a
# scatter plot with Age on the y-axis and County on the x-axis. Rows missing
# either value are dropped first.
fig = px.scatter(
    df.dropna(subset=["County", "AgeFormatted"]),
    x="County",
    y="AgeFormatted",
    color="County",
    hover_name="Freed_FirstName",
    # The chart plots County against Age, so the title says so
    # (it previously read "County Vs Sex").
    title="County Vs Age",
)
fig.show()


# In[34]:

# Build a small frame holding the CoF issue date and Sex, then count the
# number of CoFs issued per Sex per year.
dfp = pd.DataFrame({
    'CoFDate': pd.to_datetime(df['DateFormatted']),
    'Sex': df['Sex'],
})
freq = 'Y'

# The groupby aggregates the count of CoFs issued by Sex within each period.
dfp = (
    dfp.groupby(['Sex', pd.Grouper(key='CoFDate', freq=freq)])['Sex']
    .agg(['count'])
    .reset_index()
)
print(dfp)

# Sort by date, then by count, and renumber the rows.
dfp = dfp.sort_values(by=['CoFDate', 'count']).reset_index(drop=True)


# # Interactive Line Chart Visualization of Sex of the Enslaved vs Year of the issue of CoF Document vs Counts

# In[35]:

# Plot the yearly counts created above as a stacked area chart.
fig = px.area(dfp, x='CoFDate', y='count', color='Sex', title="CoF Issued Date vs Sex vs Counts")
fig.show()
# ![MD_Historical_Context](Pics/MD_Hist_Imp.PNG "MD_Historical_Context")
#
# The visualization above shows a spike in the issue of Certificates of Freedom
# around 1832, which matches historical events believed to have happened around
# the same period in the state of Maryland.

# ## Interactive Network Visualization and Analysis for the Enslaved and Owner (an example)

# In[36]:

# This needs to be run for the Bokeh library to render inside the notebook.
output_notebook()


# In[37]:

# Take a sample slice of the original dataset: the records belonging to the
# slave owner whose last name is 'Atwell'. The slice is used to show the network
# of enslaved people owned by this single owner, who sits at the centre of the
# network chart below.
LoS_CoF_df = df.loc[df["Owner_LastName"] == 'Atwell']
LoS_CoF = networkx.from_pandas_edgelist(
    LoS_CoF_df,
    source='Owner_LastName',
    target='Freed_FirstName',
    edge_attr='DataItem',
)


# In[38]:

# Static preview of the owner -> enslaved network using matplotlib.
plt.figure(figsize=(8, 8))
networkx.draw(LoS_CoF, with_labels=True, node_color='skyblue', width=.3, font_size=8)
# Show the figure explicitly so later matplotlib plots do not draw into it
# (the word cloud cells call plt.show() the same way).
plt.show()

# Title of the interactive plot. The graph links each owner's last name to the
# first names of the people freed, so the title names those two fields.
title = 'Legacy Of Slavery Certificates of Freedom - Enslaved First Name vs Owner Last Name'

# Categories that appear when hovering over a node. The node index holds either
# the owner's last name or a freed person's first name, so the label is generic.
HOVER_TOOLTIPS = [("Name", "@index")]

# Create a plot: set dimensions, toolbar, and title.
plot = figure(
    tooltips=HOVER_TOOLTIPS,
    tools="pan,wheel_zoom,save,reset",
    active_scroll='wheel_zoom',
    x_range=Range1d(-10.1, 10.1),
    y_range=Range1d(-10.1, 10.1),
    title=title,
)

# Create a network graph object with a spring layout.
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
network_graph = from_networkx(LoS_CoF, networkx.spring_layout, scale=10, center=(0, 0))

# Set node size and color.
network_graph.node_renderer.glyph = Circle(size=15, fill_color='skyblue')

# Set edge opacity and width.
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)

# Add the network graph to the plot and display it.
plot.renderers.append(network_graph)
show(plot)
# To write the plot to disk instead: save(plot, filename=f"{title}.html")


# # Interactive Geo Map Visualization of Maryland Counties from the Dataset (still work to be done)

# In[39]:

# Below is an important transformation step: a new dataframe is created from the
# original data to map the county codes used in the CoF dataset to the commonly
# used numeric 'fips' code, which geomapping services use to locate the area on
# the geo map automatically.
# County abbreviation -> (FIPS code, county name) lookup used for geo mapping.
# For example, a FIPS code of 24003 is assigned to CoF entries with County='AA',
# which maps to 'Anne Arundel' county in MD based on MSA's classification here --
# https://msa.maryland.gov/msa/speccol/sc2600/sc2685/html/abbrev.html
#
# Baltimore County (BA) is FIPS 24005 and Baltimore City (BC) is FIPS 24510;
# these two codes were previously swapped.
MD_COUNTY_LOOKUP = {
    "AA": ("24003", "Anne Arundel"),
    "AL": ("24001", "Allegany"),
    "BA": ("24005", "Baltimore County"),
    "BC": ("24510", "Baltimore City"),
    "CA": ("24011", "Caroline"),
    "CE": ("24015", "Cecil"),
    "CH": ("24017", "Charles"),
    "CR": ("24013", "Carroll"),
    "CV": ("24009", "Calvert"),
    "DO": ("24019", "Dorchester"),
    "FR": ("24021", "Frederick"),
    "GA": ("24023", "Garrett"),
    "HA": ("24025", "Harford"),
    "HO": ("24027", "Howard"),
    "KE": ("24029", "Kent"),
    "MO": ("24031", "Montgomery"),
    "PG": ("24033", "Prince George's"),
    "QA": ("24035", "Queen Anne's"),
    "SM": ("24037", "St. Mary's"),
    "SO": ("24039", "Somerset"),
    "TA": ("24041", "Talbot"),
    "WA": ("24043", "Washington"),
    "WI": ("24045", "Wicomico"),
    "WO": ("24047", "Worcester"),
}
ABBREV_TO_FIPS = {abbrev: code for abbrev, (code, _name) in MD_COUNTY_LOOKUP.items()}
ABBREV_TO_NAME = {abbrev: name for abbrev, (_code, name) in MD_COUNTY_LOOKUP.items()}
FIPS_TO_NAME = {code: name for code, name in MD_COUNTY_LOOKUP.values()}

# Work on a copy so the original frame is not modified by the new columns.
dfg = df.copy()

# Normalise the abbreviation before the lookup so variants such as 'Qa' and
# 'QA' resolve to the same county. Missing or unknown values stay unmapped (NaN).
county_key = dfg["County"].astype(str).str.strip().str.upper()
dfg["County_Code"] = county_key.map(ABBREV_TO_FIPS)
dfg["County_New"] = county_key.map(ABBREV_TO_NAME)


# In[40]:

# A similar groupby/aggregate step consolidates the counts of CoFs issued in
# each county for use in the Geo map visualizations below. After the count
# aggregation, every column holds the number of non-null entries per county.
values = dfg.groupby('County_Code').agg('count')
values['fips'] = values.index
# Derive the display name from the FIPS code itself, so each name always lines
# up with its own row instead of relying on the sort order of a second groupby.
values['county_name'] = values.index.map(FIPS_TO_NAME)


# In[41]:

# The Plotly Mapbox tool is used to create a geo map with the fips codes and
# per-county CoF counts from the previous steps, with different color ranges,
# as an interactive map showing MD state and its counties. The map zooms to MD
# state via the zoom parameter and the latitude and longitude values below.
# Interactive Mapbox choropleth of the number of CoFs issued per Maryland
# county, centred and zoomed on the state.
map_title = '# of CoFs issued in MD State by Counties'
fig = px.choropleth_mapbox(
    values,
    geojson=counties,
    locations='fips',
    color='County',
    title=map_title,
    color_continuous_scale="Edge",
    range_color=(10, 5000),
    mapbox_style="carto-positron",
    hover_name='county_name',
    zoom=6,
    center={"lat": 39.0458, "lon": -76.641273},
    opacity=0.5,
    hover_data=['county_name'],
    labels={'County': '# Counts of CoF', 'county_name': 'County Name'},
)
fig.update_layout(title=map_title)
fig.show()


# In[42]:

# Another geo map visualization, implemented with the Plotly Figure Factory
# package. It is not as interactive as the Plotly Mapbox map above.
import shapely
import shapefile
import plotly
from plotly.figure_factory._county_choropleth import create_choropleth
import plotly.figure_factory as ff

fig = ff.create_choropleth(
    fips=values.index.to_list(),
    scope=['Maryland'],
    values=values.County.to_list(),
    title='MD State with Counties',
    round_legend_values=True,
    show_state_data=True,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},
    exponent_format=True,
    legend_title='# Counts of CoF',
)
fig.layout.template = None
fig.show()


# # Word Cloud Visualization of an important feature (Notes) which are comments/remarks entered by transcribers.

# In[43]:


def _notes_corpus(notes):
    """Join the non-missing entries of *notes* into one space-separated string.

    Building the text from the values themselves (rather than from the Series'
    printed representation) keeps "NaN" placeholders for missing notes and the
    index labels out of the word cloud input.
    """
    return " ".join(notes.dropna().astype(str))


def _show_wordcloud(cloud):
    """Display a generated word cloud image without axes."""
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# Start with the Notes feature.
text = df['Notes']
notes_text = _notes_corpus(text)

# Create, generate and display a word cloud image.
wordcloud = WordCloud().generate(notes_text)
_show_wordcloud(wordcloud)


# In[44]:

# Same word cloud, but with frequent place names excluded in addition to the
# default stop words.
stopwords = set(STOPWORDS)
stopwords.update(["Anne", "Arundel", "Baltimore", "Arundel County", "Dorchester", "County"])

wordcloud = WordCloud(stopwords=stopwords).generate(notes_text)
_show_wordcloud(wordcloud)


# # End of Modules.

# [Click here to go to index page](index.ipynb)