from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly as py
from plotly import tools
from datetime import date
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import operator
# Suppress every warning for the remainder of the session (this also hides
# deprecation warnings emitted by pandas/plotly).
warnings.filterwarnings("ignore")
# Enable plotly's offline notebook rendering so that iplot() can draw inline.
# NOTE(review): per plotly's documentation connected=True loads plotly.js from
# a CDN instead of embedding it in the notebook -- confirm network access.
init_notebook_mode(connected=True)
# Load the train and test splits from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Summary of Train Dataset: ")
# NOTE: a bare expression is only rendered when it is the last expression of a
# notebook cell; when this file runs as a plain script the summary is computed
# and discarded.
train.describe()

# Poverty level codes in the order the bars are drawn, each paired with its
# label and colour. The pairing is made explicit here instead of relying on
# value_counts() happening to sort the classes by frequency in this order.
level_codes = [4, 2, 3, 1]
levels = ["NonVulnerable", "Moderate Poverty", "Vulnerable", "Extereme Poverty"]

# Count rows per level, align the counts with level_codes (0 for a level that
# never occurs) and name the column explicitly so that `target.Target` works
# on every pandas version (pandas >= 2.0 would otherwise call it "count").
target = (
    train['Target']
    .value_counts()
    .reindex(level_codes, fill_value=0)
    .to_frame(name='Target')
)

trace = go.Bar(y=target.Target, x=levels, marker=dict(color=['green', 'blue', 'orange', 'red'], opacity=0.6))
layout = dict(title="Household Poverty Levels", margin=dict(l=200), width=800, height=400)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
def compare_plot(col, title):
    """Build stacked-bar traces comparing a 0/1 column across poverty levels.

    Reads the module-level ``train`` DataFrame. For each ``Target`` value
    1 to 4 the rows are filtered and the occurrences of 0 and 1 in ``col``
    are counted.

    Args:
        col: Name of a column in ``train`` holding 0/1 flags.
        title: Unused; kept so existing call sites keep working.

    Returns:
        A ``(not_present, present)`` tuple of ``go.Bar`` traces. Each has one
        bar per poverty level, ordered Target 1, 2, 3, 4.
    """
    xx = ['Extereme', 'Moderate', 'Vulnerable', 'NonVulnerable']

    # One {value: count} dict per poverty level, in the same order as ``xx``.
    counts_per_level = [
        train[train['Target'] == level][col].value_counts().to_dict()
        for level in (1, 2, 3, 4)
    ]

    # .get(..., 0): a level in which a flag value never occurs is simply absent
    # from value_counts(); plot it as zero instead of raising KeyError.
    absent = [counts.get(0, 0) for counts in counts_per_level]
    present = [counts.get(1, 0) for counts in counts_per_level]

    trace1 = go.Bar(y=absent, name="Not Present", x=xx, marker=dict(color='red', opacity=0.6))
    trace2 = go.Bar(y=present, name="Present", x=xx, marker=dict(color='green', opacity=0.6))
    return trace1, trace2
# Asset flag columns and the subplot title used for each of them.
_asset_specs = [
    ("v18q", "Tablet"),
    ("refrig", "Refrigirator"),
    ("computer", "Computer"),
    ("television", "Television"),
    ("mobilephone", "MobilePhone"),
]

# One (not present, present) trace pair per asset, built in listing order.
(tr1, tr2), (tr3, tr4), (tr5, tr6), (tr7, tr8), (tr9, tr10) = [
    compare_plot(_column, _label) for _column, _label in _asset_specs
]

titles = [_label for _column, _label in _asset_specs]
fig = tools.make_subplots(rows=3, cols=2, print_grid=False, subplot_titles=titles)

# Fill the 3x2 grid row by row; the sixth cell stays empty.
_trace_pairs = [(tr1, tr2), (tr3, tr4), (tr5, tr6), (tr7, tr8), (tr9, tr10)]
_grid_cells = [(1, 1), (1, 2), (2, 1), (2, 2), (3, 1)]
for (_absent, _present), (_grid_row, _grid_col) in zip(_trace_pairs, _grid_cells):
    fig.append_trace(_absent, _grid_row, _grid_col)
    fig.append_trace(_present, _grid_row, _grid_col)

fig['layout'].update(height=1000, title="What do Households Own", barmode="stack", showlegend=False)
iplot(fig)
def find_prominent(row, mats):
    """Return the first name in ``mats`` whose entry in ``row`` equals 1.

    Args:
        row: Any mapping-like object indexable by the names in ``mats``.
        mats: Candidate names, checked in the given order.

    Returns:
        The first matching name, or ``None`` when no entry equals 1.
    """
    return next((name for name in mats if row[name] == 1), None)
def combine(starter, colname, title, replacemap):
    """Collapse a group of 0/1 columns into one label column and chart it.

    Every column of the module-level ``train`` DataFrame whose name starts
    with ``starter`` is treated as a 0/1 flag. For each row, the first flagged
    column (see ``find_prominent``) is translated through ``replacemap`` and
    stored in ``train[colname]``; rows with no flag set keep ``None``.

    Args:
        starter: Column-name prefix selecting the flag columns.
        colname: Name of the derived column written to ``train``.
        title: Unused; kept so existing call sites keep working.
        replacemap: Maps each flag column name to its display label.

    Returns:
        A list of four ``go.Bar`` traces, one per ``Target`` value 1 to 4,
        each counting the derived labels within that poverty level.

    Raises:
        KeyError: If a flagged column name is missing from ``replacemap``.
    """
    # Exclude the derived column itself: names such as "sanitary" or
    # "energy_source" match their own prefix when this is run a second time.
    mats = [c for c in train.columns if c.startswith(starter) and c != colname]
    train[colname] = train.apply(lambda row: find_prominent(row, mats), axis=1)
    train[colname] = train[colname].apply(lambda x: replacemap[x] if x is not None else x)

    # (Target value, legend name, colour, opacity) for each poverty level.
    level_styles = [
        (1, "Extereme", 'red', 0.9),
        (2, "Moderate", 'red', 0.5),
        (3, "Vulnerable", 'green', 0.5),
        (4, "NonVulnerable", 'green', 0.9),
    ]

    traces = []
    for level, name, color, opacity in level_styles:
        # Use the counts Series directly: the column name produced by
        # value_counts().to_frame() differs between pandas versions.
        counts = train.loc[train['Target'] == level, colname].value_counts()
        traces.append(go.Bar(y=counts.values, x=counts.index, name=name,
                             marker=dict(color=color, opacity=opacity)))
    return traces
titles = ["Outside Wall Material", "Floor Material", "Roof Material", "Sanitary Conditions", "Cooking Energy Sources", "Disposal Methods"]
fig = tools.make_subplots(rows=3, cols=2, print_grid=False, subplot_titles=titles)

# One entry per subplot:
# (column prefix, derived column, title, flag-column -> label map, row, col)
_panels = [
    # outside material
    ("pared", "outside_material", "Predominanat Material of the External Walls",
     {'paredblolad': "Block / Brick", "paredpreb": "Cement", "paredmad": "Wood",
      "paredzocalo": "Socket", "pareddes": "Waste Material", "paredfibras": "Fibres",
      "paredother": "Other", "paredzinc": "Zink"},
     1, 1),
    # floor material
    ("piso", "floor_material", "Floor Material of the Households",
     {'pisomoscer': "Mosaic / Ceramic", "pisocemento": "Cement", "pisonatur": "Natural Material",
      "pisonotiene": "No Floor", "pisomadera": "Wood", "pisoother": "Other"},
     1, 2),
    # roof material
    ("tech", "roof_material", "Roof Material of the Households",
     {'techozinc': "Zinc", "techoentrepiso": "Fibre / Cement", "techocane": "Natural Fibre",
      "techootro": "Other"},
     2, 1),
    # sanitary conditions
    ("sanit", "sanitary", "Sanitary Conditions of the Households",
     {'sanitario1': "No Toilet", "sanitario2": "Sewer / Cesspool", "sanitario3": "Septic Tank",
      "sanitario5": "Black Hole", "sanitario6": "Other System"},
     2, 2),
    # energy source
    ("energ", "energy_source", "Main source of energy for cooking",
     {'energcocinar1': "No Kitchen", "energcocinar2": "Electricity", "energcocinar3": "Cooking Gas",
      "energcocinar4": "Wood Charcoal"},
     3, 1),
    # disposal methods
    ("elim", "waste_method", "Rubbish Disposals Method",
     {"elimbasu1": "Tanker truck",
      "elimbasu2": "Buried",
      "elimbasu3": "Burning",
      "elimbasu4": "Unoccupied space",
      "elimbasu5": "River",
      "elimbasu6": "Other"},
     3, 2),
]

# Derive each label column and place its four per-level traces in its cell.
for _starter, _colname, _title, flr, _grid_row, _grid_col in _panels:
    res = combine(_starter, _colname, _title, flr)
    for x in res:
        fig.append_trace(x, _grid_row, _grid_col)

fig['layout'].update(height=900, title="Key Characteristics of Households", barmode="stack", showlegend=False)
iplot(fig)
def combine2(starter, colname, title, replacemap, plotme=True):
    """Collapse a group of 0/1 columns into one label column and plot it.

    Every column of the module-level ``train`` DataFrame whose name starts
    with ``starter`` is treated as a 0/1 flag. For each row, the first flagged
    column (see ``find_prominent``) is translated through ``replacemap`` and
    stored in ``train[colname]``; rows with no flag set keep ``None``.

    Args:
        starter: Column-name prefix selecting the flag columns.
        colname: Name of the derived column written to ``train``.
        title: Title of the stacked bar chart.
        replacemap: Maps each flag column name to its display label.
        plotme: When false, only the derived column is created and nothing
            is drawn.

    Returns:
        None. The results are the new ``train[colname]`` column and, when
        ``plotme`` is true, a figure shown through ``iplot``.

    Raises:
        KeyError: If a flagged column name is missing from ``replacemap``.
    """
    # Exclude the derived column itself in case it shares the prefix and the
    # function is run a second time on the same frame.
    mats = [c for c in train.columns if c.startswith(starter) and c != colname]
    train[colname] = train.apply(lambda row: find_prominent(row, mats), axis=1)
    train[colname] = train[colname].apply(lambda x: replacemap[x] if x is not None else x)

    # Nothing to draw: skip building traces and a figure that would be
    # thrown away.
    if not plotme:
        return

    # (Target value, legend name, colour, opacity) for each poverty level.
    level_styles = [
        (1, "Extereme", 'red', 0.9),
        (2, "Moderate", 'red', 0.5),
        (3, "Vulnerable", 'orange', 0.9),
        (4, "NonVulnerable", 'orange', 0.5),
    ]

    data = []
    for level, name, color, opacity in level_styles:
        # Use the counts Series directly: the column name produced by
        # value_counts().to_frame() differs between pandas versions.
        counts = train.loc[train['Target'] == level, colname].value_counts()
        data.append(go.Bar(y=counts.values, x=counts.index, name=name,
                           marker=dict(color=color, opacity=opacity)))

    layout = dict(title=title, legend=dict(y=1.1, orientation="h"), barmode="stack", margin=dict(l=50), height=400)
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
# Education level: one chart of the derived "education_details" column.
flr = {"instlevel1": "No Education", "instlevel2": "Incomplete Primary", "instlevel3": "Complete Primary",
       "instlevel4": "Incomplete Sc.", "instlevel5": "Complete Sc.", "instlevel6": "Incomplete Tech Sc.",
       "instlevel7": "Complete Tech Sc.", "instlevel8": "Undergraduation", "instlevel9": "Postgraduation"}
combine2("instl", "education_details", "Education Details of Family Members", flr)

# Civil status: one chart of the derived "status_members" column.
flr = {"estadocivil1": "< 10 years", "estadocivil2": "Free / Coupled union", "estadocivil3": "Married",
       "estadocivil4": "Divorced", "estadocivil5": "Separated", "estadocivil6": "Widow",
       "estadocivil7": "Single"}
combine2("estado", "status_members", "Status of Family Members", flr)

# Relationship to the household head: one chart of "family_members".
flr = {"parentesco1": "Household Head", "parentesco2": "Spouse/Partner", "parentesco3": "Son/Daughter",
       "parentesco4": "Stepson/Daughter", "parentesco5": "Son/Daughter in Law", "parentesco6": "Grandson/Daughter",
       "parentesco7": "Mother/Father", "parentesco8": "Mother/Father in Law", "parentesco9": "Brother/Sister",
       "parentesco10": "Brother/Sister in law", "parentesco11": "Other Family Member", "parentesco12": "Other Non Family Member"}
combine2("parentesc", "family_members", "Family Members in the Households", flr)

# Region: only the derived "region" column is created (plotme=False).
# The "lugar3" label was double-encoded mojibake and has been restored to
# proper UTF-8, matching the already-correct "Huetar Atlántica".
flr = {"lugar1": "Central", "lugar2": "Chorotega", "lugar3": "Pacífico central",
       "lugar4": "Brunca", "lugar5": "Huetar Atlántica", "lugar6": "Huetar Norte"}
combine2("lugar", "region", "Region of the Households", flr, plotme=False)