#!/usr/bin/env python
# coding: utf-8

# # Step - 3: Legacy of Slavery Certificate of Freedom Case Study - Data Visualization

# In[19]:


# import important libraries used for visualization purposes
import pandas as pd
import networkx
import numpy as np
import geopandas as gpd
import shapely as shp
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
# import cufflinks
import plotly
# word cloud library
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# # Using plotly + cufflinks in offline mode
# import cufflinks
# cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# matplotlib library
import matplotlib.pyplot as plt

# import bokeh library which is a famous one for network analysis
from bokeh.io import output_notebook, show, save

# these are needed for Network Visualization below
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
from bokeh.plotting import figure
from bokeh.plotting import from_networkx

# these are needed for Geo Map visualization below
import plotly.figure_factory as ff
from urllib.request import urlopen
import json
# Download the county-boundary GeoJSON that the choropleth map further below
# passes to plotly as its `geojson` argument.
COUNTIES_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(COUNTIES_GEOJSON_URL) as geojson_response:
    counties = json.load(geojson_response)


# In[20]:


# Reload the cleaned dataset that the previous step (step 2) saved as a CSV file
cleaned_csv_path = "Datasets/LoS_Clean_Output_Mod2.csv"
df = pd.read_csv(cleaned_csv_path)
# Preview the first ten rows
df.head(n=10)


# In[29]:


# pandas objects such as the columns of 'df' do not have an iplot method on their own.
# cufflinks links plotly to pandas and adds the iplot method used by the histogram cells below.
import cufflinks as cf
# Put cufflinks into plotly's offline mode
cf.go_offline()
# NOTE(review): offline=False here looks at odds with go_offline() above - confirm which setting is intended.
cf.set_config_file(offline=False, world_readable=True)


# In[30]:


# Simple histograms (drawn with the cufflinks-provided iplot) of the age and
# height columns; each entry pairs a dataframe column with its chart title.
histogram_specs = (
    ('AgeFormatted', 'Histogram Chart of Enslaved Age vs Count'),
    ('Height_Inches', 'Histogram Chart of Enslaved Height vs Count'),
)
for column_name, chart_title in histogram_specs:
    df[column_name].iplot(kind='hist', title=chart_title)


# In[32]:


# Pie chart of how the certificates are distributed over the 'Sex' column
pie_options = dict(
    names='Sex',
    color_discrete_sequence=px.colors.sequential.RdBu,
    title="Pie Chart of Distribution of CoF over Sex",
)
fig = px.pie(df, **pie_options)
fig.show()


# In[33]:


# Scatter plot built with Plotly Express (docs: https://plotly.com/python/plotly-express/)
# showing Age on the y-axis against County on the x-axis.
# Rows missing either the county or the age cannot be placed on the chart, so drop them first.
county_age_rows = df.dropna(subset=["County", "AgeFormatted"])
fig = px.scatter(
    county_age_rows,
    x="County",
    y="AgeFormatted",
    color="County",
    hover_name="Freed_FirstName",
    # The chart plots age (not sex) per county, so the title says so.
    title="County Vs Age",
)

fig.show()


# In[34]:


# Build a helper dataframe with just the certificate date (parsed to datetime) and the Sex
# column, then count the certificates per Sex within each date bucket defined by `freq`.
dfp = pd.DataFrame({
    'CoFDate': pd.to_datetime(df['DateFormatted']),
    'Sex': df['Sex'],
})
freq = 'Y'
date_bucket = pd.Grouper(key='CoFDate', freq=freq)
# Group by Sex and date bucket; the aggregate is the number of CoFs issued per group
per_sex_and_period = dfp.groupby(['Sex', date_bucket])['Sex']
dfp = per_sex_and_period.agg(['count']).reset_index()
print(dfp)
# Order the rows by date, then by count, and renumber the index from zero
dfp = dfp.sort_values(by=['CoFDate', 'count']).reset_index(drop=True)


# # Interactive Line Chart Visualization of Sex of the Enslaved vs Year of the issue of CoF Document vs Counts

# In[35]:


# Area chart of the per-Sex certificate counts over time, using the 'dfp' frame built above.
# (The previous empty go.Figure() was discarded immediately, so it has been removed.)
fig = px.area(dfp, x='CoFDate', y='count', color='Sex', title="CoF Issued Date vs Sex vs Counts")
fig.show()


# ![MD_Historical_Context](Pics/MD_Hist_Imp.PNG "MD_Historical_Context")
# 
# The visualization above shows a spike in the issuance of Certificates of Freedom around 1832, which coincides with historical events believed to have occurred in Maryland around the same period.

# ## Interactive Network Visualization and Analysis for the Enslaved and Owner (an example)

# In[36]:


# Bokeh must be told to render inside the notebook before any Bokeh plot is shown
output_notebook()


# In[37]:


# Take a sample slice of the dataset: only the records whose owner last name is 'Atwell'.
# The slice is turned into a graph whose edges run from the owner's last name to each
# freed person's first name, carrying the 'DataItem' column as the edge attribute.
owned_by_atwell = df["Owner_LastName"].eq('Atwell')
LoS_CoF_df = df[owned_by_atwell]
LoS_CoF = networkx.from_pandas_edgelist(
    LoS_CoF_df,
    source='Owner_LastName',
    target='Freed_FirstName',
    edge_attr='DataItem',
)


# In[38]:


# Draw the network between the single owner with last name 'Atwell' and the people named
# on the matching certificates: first as a static matplotlib figure, then as an
# interactive Bokeh plot.
plt.figure(figsize=(8,8))
networkx.draw(LoS_CoF, with_labels=True, node_color='skyblue', width=.3, font_size=8)

# Plot title. The graph edges join 'Owner_LastName' to 'Freed_FirstName', so the title
# names the first name of the enslaved person rather than the last name.
title = 'Legacy Of Slavery Certificates of Freedom - Enslaved First Name vs Owner Last Name'

# Fields shown when hovering over a node ("@index" is the node's own label)
HOVER_TOOLTIPS = [("Freed_FirstName", "@index")]

# Create the plot: dimensions, toolbar and title
plot = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1), title=title)

# Create a network graph object with a spring layout
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
network_graph = from_networkx(LoS_CoF, networkx.spring_layout, scale=10, center=(0, 0))

# Node size and color
network_graph.node_renderer.glyph = Circle(size=15, fill_color='skyblue')

# Edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)

# Add the network graph to the plot and display it
plot.renderers.append(network_graph)

show(plot)
#save(plot, filename=f"{title}.html")


# # Interactive Geo Map Visualization of Maryland Counties from the Dataset (still work to be done)

# In[39]:


# Transformation step: map the county abbreviations used in the CoF dataset to the numeric
# 'fips' codes that geo-mapping services use to locate an area, and to the full county name.
# For example, entries with County='AA' get fips code 24003 and the name 'Anne Arundel',
# following MSA's abbreviation list -- https://msa.maryland.gov/msa/speccol/sc2600/sc2685/html/abbrev.html
#
# Fix: Baltimore County (BA) is FIPS 24005 and Baltimore City (BC) is FIPS 24510; the two
# codes were previously swapped, which coloured the wrong areas on the maps.
MD_COUNTY_LOOKUP = {
    # abbreviation: (fips code, county name)
    'AA': ("24003", "Anne Arundel"),
    'AL': ("24001", "Allegany"),
    'BA': ("24005", "Baltimore County"),
    'BC': ("24510", "Baltimore City"),
    'CA': ("24011", "Caroline"),
    'CE': ("24015", "Cecil"),
    'CH': ("24017", "Charles"),
    'CR': ("24013", "Carroll"),
    'CV': ("24009", "Calvert"),
    'DO': ("24019", "Dorchester"),
    'FR': ("24021", "Frederick"),
    'GA': ("24023", "Garrett"),
    'HA': ("24025", "Harford"),
    'HO': ("24027", "Howard"),
    'KE': ("24029", "Kent"),
    'MO': ("24031", "Montgomery"),
    'PG': ("24033", "Prince George's"),
    # The data contains both spellings of the Queen Anne's abbreviation
    'Qa': ("24035", "Queen Anne's"),
    'QA': ("24035", "Queen Anne's"),
    'SM': ("24037", "St. Mary's"),
    'SO': ("24039", "Somerset"),
    'TA': ("24041", "Talbot"),
    'WA': ("24043", "Washington"),
    'WI': ("24045", "Wicomico"),
    'WO': ("24047", "Worcester"),
}

# NOTE: dfg is the same object as df (not a copy), so the two new columns are added to df
# as well. Rows whose County is not in the lookup are left without a code or name.
dfg = df
for county_abbrev, (fips_code, county_name) in MD_COUNTY_LOOKUP.items():
    rows_in_county = dfg["County"] == county_abbrev
    dfg.loc[rows_in_county, "County_Code"] = fips_code
    dfg.loc[rows_in_county, "County_New"] = county_name


# In[40]:


# Consolidate the number of CoFs issued in each county for the geo map visualizations below.
# Grouping by 'County_Code' and counting gives, for every other column, the number of
# non-null entries per county; the 'County' column of the result is used as the CoF count.
values = dfg.groupby('County_Code').agg('count')
values['fips'] = values.index
# Look the county name up per fips code so it is aligned on the index. Taking the index of a
# separate groupby on 'County_New' and assigning it positionally mislabels rows, because
# sorting by name and sorting by fips code do not give the same order (for example
# 'Somerset' sorts before "St. Mary's", while their codes are 24039 and 24037).
values['county_name'] = dfg.groupby('County_Code')['County_New'].first()


# In[41]:


# Interactive Plotly Mapbox choropleth: each county (matched to the GeoJSON through its
# fips code) is coloured by the CoF count held in the 'County' column of 'values'.
# The zoom level and the latitude/longitude centre make the map open on MD state.
map_title = '# of CoFs issued in MD State by Counties'
map_center = {"lat": 39.0458, "lon": -76.641273}
display_labels = {'County': '# Counts of CoF', 'county_name': 'County Name'}

fig = px.choropleth_mapbox(
    values,
    geojson=counties,
    locations='fips',
    color='County',
    title=map_title,
    color_continuous_scale="Edge",
    range_color=(10, 5000),
    mapbox_style="carto-positron",
    hover_name='county_name',
    hover_data=['county_name'],
    zoom=6,
    center=map_center,
    opacity=0.5,
    labels=display_labels,
)
fig.update_layout(title=map_title)
fig.show()


# In[42]:


# A second geo map of the same county counts, this time drawn with Plotly's Figure Factory
# package. It is not as interactive as the Plotly Mapbox version.
import shapely
import shapefile
import plotly
from plotly.figure_factory._county_choropleth import create_choropleth
import plotly.figure_factory as ff

# One fips code and one CoF count per county, in matching order
county_fips = values.index.tolist()
county_counts = values['County'].tolist()
outline_style = {'color': 'rgb(255,255,255)', 'width': 0.5}

fig = ff.create_choropleth(
    fips=county_fips,
    values=county_counts,
    scope=['Maryland'],
    title='MD State with Counties',
    legend_title='# Counts of CoF',
    round_legend_values=True,
    exponent_format=True,
    show_state_data=True,
    county_outline=outline_style,
)
fig.layout.template = None
fig.show()


# # Word Cloud Visualization of an important feature (Notes) which are comments/remarks entered by transcribers.

# In[43]:


# Start with the Notes feature:
text = df['Notes']

# Join only the notes that are actually present into one string. Series.to_string() would
# also emit the row index labels and render every missing note as the literal word "NaN",
# which then shows up in the word cloud.
notes_corpus = " ".join(text.dropna().astype(str))

# Create and generate a word cloud image:
wordcloud = WordCloud().generate(notes_corpus)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


# In[44]:


# Start with the Notes feature:
text = df['Notes']

# Join only the notes that are actually present into one string. Series.to_string() would
# also emit the row index labels and render every missing note as the literal word "NaN",
# which then shows up in the word cloud.
notes_corpus = " ".join(text.dropna().astype(str))

# Extend the default stop words with place names that would otherwise dominate the cloud
stopwords = set(STOPWORDS)
stopwords.update(["Anne", "Arundel", "Baltimore", "Arundel County", "Dorchester","County"])

# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=stopwords).generate(notes_corpus)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


# # End of Modules.
# [Click here to go to index page](index.ipynb)