%matplotlib inline
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pickle, os, gc, re
from utility import *
import warnings
warnings.filterwarnings('ignore')
# from IPython.display import display
# from IPython.display import HTML
# import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
# # This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# # This line will add a button to toggle visibility of code blocks, for use with the HTML export version
# di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
# Enable in-notebook Bokeh output. NOTE(review): `output_notebook` is not
# imported in this file until L710; here it presumably comes in via
# `from utility import *` — confirm against utility.py.
output_notebook()
# <!-- collapse=True -->
# Data lives under ../BES_analysis_data/PublicWhip.
# `create_subdir` is a project helper from utility — presumably creates the
# subfolder if missing and returns its path (TODO confirm).
BES_data_folder = '../BES_analysis_data/'
PublicWhip_folder = create_subdir(BES_data_folder,"PublicWhip")
# bytes form of the path, for os.listdir in the parsing loop below
directory = os.fsencode(PublicWhip_folder)
# Numeric vote codes used in the Public Whip .dat files -> text labels
# (the per-file .txt key documents which number means aye/no/abstain etc.).
replace_dict = dict([
    (-9, 'missing'),
    (1, 'tellaye'),
    (2, 'aye'),
    (3, 'both'),
    (4, 'no'),
    (5, 'tellno'),
])
# https://www.publicwhip.org.uk/data/ideas-marginal.txt
# Ideas of things that could be done with Public Whip data, or similar.
# I don't intend to do any of them, as they don't fit my goals - but they
# might give you useful suggestions. There are probably a few political
# science papers in here.
# Some similar things done with US voting data:
# http://stat.columbia.edu/~jakulin/Politics/index.htm
# Party politics
# --------------
# Table of most rebellious by party - so see most rebellious Con and LDem.
# At the moment only Labour show up.
# Table of highest attendance by party - similarly.
# Rankings on rebellion/attendance PER PARTY
# Compare attendance rates between parties, as they go in and out of
# government over the years (use raw data "Extra Turnout" uses)
# Compare rebellion rates between parties
# Find out most rebellious divisions by party
# e.g. find out what issues are fracturing the Tories
# Statistics by party - e.g. attendance rate.
# How rebellious a party it is. etc.
# Proportion of abstentions within each party, may reveal times when MPs abstain in protest.
# % of completely pliant MPs in each party, who always follow whip.
# Do Co-op members ever vote against Labour as a group?
# Academic analyses (of no short term practical political benefit)
# -----------------
# Graph of how many MPs defect in each vote over time
# Watch for loyalty going down after start of term, to lowest at midterm,
# then up again
# Analyse if MPs who are "sir" vote differently in anyway
# first check data integrity that title always has "Sir" for knights
# Get data on gender etc. and analyse against that
# Regional analysis. Scotland, NI, Wales, North v South. Urban v. Rural.
# Area of land for constituency. This gives a "ruralness" measure.
# Population of constituency.
# Distance of constituency from London vs. attendance rate
# Integrate parliamentary majority, and look for correlations with
# rebelliousness? Majorities here:
# http://www.psr.keele.ac.uk/area/uk/mps.htm
# (Should be no correlation, as reselection more important?)
# Plot majority as a colour on the cluster diagram
# DONE - no correlation. See website/custom/majority-rebellions.png
# First term MPs vs. old warhorses. More rebellious? Less attentive?
# Find people who have telled the most times
# Cluster distractions (geek fun, but pointless)
# --------------------
# Make clustering cope with tellaye/tellno
# Make cluster stuff store NUMBER OF VOTES both voted in for extra possible friends info
# Chris Lighthead: "I've now written some code to estimate which of the
# eigenvectors are significant. The basic idea is that we generate
# synthetic data using the marginal distributions for each statements --
# that is, like the data which would have been produced by the same number
# of respondents as have completed the real survey, but as if their
# answers to any one question were unrelated to all the others; and having
# done that, we perform the principal components analysis on the synthetic
# data. The idea here is that we can compare the eigenvalues from the
# synthetic data to the eigenvalues from the real data. If the real
# eigenvalue is significantly larger than the one from the synthetic data,
# it likely represents real variation in the data; otherwise, random
# variation." - we could do this with MP clustering.
# Improve clustering distance algorithm
# See J Vaughan suggestions
# Colour dots in cluster diagram by how many times they have voted.
# Bright colours for more relevant the data - i.e. how many intersections
# with other's votes there are.
# Play with stuff in vector search article
# http://www.perl.com/pub/a/2003/02/19/engine.html
# In particular PDL for speeding up octave algebra stuff
# > Idea 2. Darren suggested that the reason Tony Blair is an outlier
# > in the java app is coz he only turns up to votes he thinks are
# > going to be controversial, hence ones that people are probably
# > going to vote against him.
# Find a metric to see if this is the case.
# Make cluster diagram for just divisions relating to one issue. Or
# for one person's interested issues. Plot point on cluster diagram for
# issues themselves.
# [I've tried this, with not that useful results - Francis:
# - I've made a cluster diagram with just the Iraq votes in.
# Unfortunately it isn't really revealing, kind of curious, but I'm not
# sure how useful it is in the media. Maybe you have suggestions
# otherwise! Find a static screenshot attached.
# Anti-war on the left, pro-war on the right. Colours represent
# political party. Unfortunately, lots of the dots represent dozens of
# MPs when they all voted the same, so the distribution is much denser
# to the right than it at first appears. It's interesting that Con
# and Lab still separate out top to bottom... We almost end up with four
# corners.
# Top-left: Anti-war, anti-government
# Top-right: Pro-war, anti-government
# Bottom-left: Anti-war, pro-government
# Bottom-right: Pro-war, pro-government
# However, this doesn't really tell us anything we didn't already know. ]
# EDM analysis with MDS:
# http://www.stats.bris.ac.uk/~magpn/Research/Politics/
# Written Answers
# ---------------
# Here's a couple of ideas: a section called "answers that are in the Library".
# Also, if we find enough funny business, what about, random question of
# the day? People could subscribe to this, and this would get them involved.
# Miscellaneous
# -------------
# Create classification tree so it offers some division questions for you
# to pick from, and tells you your party. As does here for EDMs:
# http://www.stats.bris.ac.uk/~magpn/Research/Politics/whovote.htm
# Find "Motion made, and Question proposed," and "Question put and agreed
# to." when there was no division. Record these "virtual divisions" as
# divisions, as they are really, just they were totally uncontested for
# whatever reason. Count them up and see where and how many there are,
# whether there are more near bedtime etc.
# What can we learn from the information that aye/no comes first, and
# from the information as to whether government or not government is
# aye or no?
# Measure lobbying power behind each issue (expenditure by interested
# parties). Again, correlate to time spent on it.
# Value of the vote. What is the monetary expenditure cost of agreeing
# the motion? Graph against time spent discussing, and see how silly the
# correlation is.
# Measure a sudden drop in attendance rate - so you could see that the MP
# used to vote a certain % of the time and now doesn't. This will
# detect illness/injury/busy for some other reason.
# https://www.publicwhip.org.uk/data/ideas.txt
# Data analysis (using existing data)
# -------------
# Rebels who voted against this division, also voted against this division...
# This is flipping the possible friends on its head. Find friends between
# divisions. Distance metric between divisions - how many MPs (out of
# MPs that were present in both divisions) voted the same.
# "Performance tests" for government - turning excessive monitoring and
# testing back onto them.
# corruptometer, loyaltometer, evilness, sleepometer, waffle-meter
# Top day of week for voting. Distribution for individual MPs.
# - this person is never there on Thursdays!
# Use abstention rates to highlight interesting divisions, where lots of
# people didn't turn up.
# Pick divisions with extreme "Extra Turnouts" (for the main parties) -
# these will be interesting
# Plot "Extra Turnout" over time - so can see how abstention rates change.
# Show the majority for each vote (how much it was won by)
# Additional numeric data
# -----------------------
# Put time of division, so can work out importance (late at night less
# governmental)
# Also length of debate leading up to division
# Intended length of debate
# It is worth looking for MPs who spoke but did not vote. This is a good
# way to detect active abstentions. It may also have all sorts of other
# interesting meanings.
# division.php?date=2003-06-10&number=224&showall=yes
# Count how many times MP spoke in a debate, or on the day
# How many articles do Labour MPs supply to the Times, for example?
# Collate all MPs articles in newspapers
# Provide local program for pressure groups. Parses their list of
# post-code membership, and creates a table of MPs who are doing the least
# where the group has the most supporters. Finds good targets for
# lobbying.
# Correlate voting record with geographical location. Do MPs at constituencies
# near each other tend to vote the same as each other?
# Cost in pounds / word of an MP
# Additional text content
# -----------------------
# House of Lords divisions
# When Dream MP gives votes more context, allow anyone to vote on any division.
# Perhaps they have to give their postcode first.
# Issue sub-selector. User can log in, name an issue, and say which way
# votes should have gone to satisfy him on that issue. Get all manner of
# people to make issues for next general election.
# - info to help select which votes and which way should have voted
# - SP adopters / constituency
# - Auto email to adopters in each ward
# - include journalism, so a comment on the issue / on each vote
# - who made the issue is crucial, as some expertise is needed, and
# viewers of the site need to understand any bias that they have
# - Report to take into your MP constituency surgery
# - Let NGOs enter postcodes of their supporters (in their own client app)
# and use these to find MPs where the most people agree with the issue,
# but the MP has voted the other way.
# Do Iraq subselection ourselves
# Do climate change subselection
# Software to follow all legislation sources:
# - parliamentary bills - which can be introduced into either House
# - white papers - on the parliamentary website I think
# - green papers - not sure
# - other consultative papers issues by departments
# - consultative papers issues by other statutory bodies such as regulators
# - papers from the downing st Performance and Innovation Unit
# - EDMs
# - private members bills
# - 10 minute rule bills
# - EC directives which have to be implemented in domestic law
# - the Queen's speech
# - international treaties which apart from major EU treaties are ratified
# under the Ponsonby Rule without a vote in parliament.
# Link from division to draft of Bill which is being debated
# http://www.parliament.uk/what_s_on/what_s_on.cfm
# http://www.parliament.the-stationery-office.co.uk/pa/cm200203/cmstand/cmstand.htm
# http://www.parliament.the-stationery-office.co.uk/pa/pabills.htm
# http://bills.ais.co.uk
# http://news.bbc.co.uk/1/hi/programmes/bbc_parliament/3370535.stm
# Group votes by Whitehall department, so you can see areas of interest
# (idea from Sirius at Oxfam)
# Put in EDMs, so can see who supports an EDM and follows it through to
# actual vote (idea from Sirius at Oxfam)
# http://edm.ais.co.uk/
# Analyse an MP's interests by their rebellions - work out a summary of
# their special interests. Also link to which debates they spoke in.
# Committees, and voting in committee
# Written answers
# ---------------
# Print department
# Match office (secretary of state...) to department? Julian does this already
# prob, check how it works and use it in all the tables, as it is useful at-a-glance
# information.
# uk.org.publicwhip/wrans/2003-11-06.755W.4 - refers to uk.org.publicwhip/wrans/2003-06-03.214W
# which does actually exist, but crosses over two entire columns, so referencing the one in the middle doesn't pick it up. Ooops. Need some kind of ranging?
# uk.org.publicwhip/wrans/2003-11-06.726W.0 - there is a phrase referring to debates, but the
# link is given to wrans ;)
# uk.org.publicwhip/wrans/2003-11-06.725W.2 - has a offrep which isn't there (as well
# as two which we don't have indexed yet)
# uk.org.publicwhip/wrans/2003-10-27.110W.2 - fails to find offrep
# Back links to wrans that have referred to this one
# Check all found URL links
# Improve all MP name matching so no unknown ids
# Make "unknown" name link a better page (rather than a meaningless error)
# Fix broken links to mp.php?id=unknown
# Stats - count up how many questions/answers each person has done
# Some kind of "most inquisitive" on front page
# League table of unanswered written questions
# Search for all questions that are about Hansard that you can now answer, and send off the answers.
# Give help on how to request answers from hcinfo
# Usability
# ---------
# Policy syndication:
# http://www.livejournal.com/developer/embedding.bml
# Javascript, make .js file with just a "document.write"
# Inline frames, just inline to HTML file
# CGI script / serverside include
# List of all divisions and how an MP voted in them, including ones he
# wasn't present at. So you could see which ones they weren't at easily.
# Kind of makes sense.
# Abstain -> difference from expected
# Expected -> average no. of attendees
# visually draw guesses further right
# Pass W3C validator
# http://validator.w3.org/
# And bobby
# Letter index on MP list
# Pie chart, rectangle graph by area for voting table
# Email reports to people when search queries change
# e.g. When your MP has voted. When he has rebelled. When an issue is
# voted on, and so on.
# RSS feed of your MP's votes
# Link from MP to other sources of info
# - Ask Aristotle
# - Fax your MP
# - Speeches on Hansard
# Link from search engine to
# - link to search Hansard?
# Links to other political resource websites
# Print reason left parliament in table at top of mp.php
# links.php - takes links to days and chunks, does a redirect reduce
# bandwidth, and do tracking of where people link through to
# Put divisions on same day as easy links from division page
# Put similar divisions on links
# META keywords (for search engine description)
# Colour blind people, or indeed blind people, need a better rebel marker
# than redness in MPs division list. Boldness is one idea.
# Log failed searches so we can improve the search engine
# mod_gzip to reduce bandwidth
# Make sure fast CGI keeps db connections somehow
# Share DB connection with main code and registration code
# Use hotcopy or similar for new db uploading
# http://www.mysql.com/doc/en/mysqlhotcopy.html
# Paragraph text needs max-width, so it can be shrunken
# - about the authors, so feels personal to people
# - consider breadcrumb trail
# - about section (not all FAQ?)
# - company name/logo at topleft, search at topright
# - print stylesheet media="print" removing menus
# > (One comment: obviously all your pages are going to get
# > indexed in, e.g., Google fairly shortly. For some search
# > terms -- names of obscurer MPs and of pieces of
# > legislation -- you'll be quite high up the results. It
# > might be worth putting a one-sentence description of the
# > project at the top of each results page to explain what's
# > going on, since some of the pages might look a touch
# > confusing for the first-time visitor.)
# Political toolbar - easy searches. Popup links automatically on MP's
# names in Internet Explorer.
# Java in Opera Win32
# Gimmicks
# --------
# MP wallplanner - show their diary
# Fantasy party/cabinet, based on voting.
# Virtual top trumps - choose your favourites, play them, lose them, try
# to get ones whose careers are on the up.
# Real top-trump cards (rebelliousness, attendance, term of office, majority, age...)
# Pick, say, 1997 parliament - or maybe becomes more journalistic/historical,
# as you want the cards to represent the MP at their most significant moment
# in time.
# Actually post a whipping sheet to MPs. This would arrive every week at
# the same time as their party whipping sheet. It would tell them how
# many voters in their constituency have registered with organisations which
# would like them to vote particular ways.
# Decentralised - get our mailing list subscribers to print the PDF and
# post it to them.
# Local newspapers - similarly contact them.
# Make big wall chart of cluster diagram - colour, pretty
# Maybe even sell it to people
# Stand as Public Whip candidate in election with manifesto to do as your
# constituents want.
# "Totally toady" to "Usual suspect"
# Broader context
# ---------------
# European Parliament.
# Minutes here:
# http://www3.europarl.eu.int/omk/omnsapir.so/calendar?APP=PV1
# Attendance rates are pre processed here:
# http://www.europarliament.net/
# Do it for all Hansards in CHEA:
# http://www.hansard-westminster.co.uk/links.asp
# Indian parliament. Give them clustering.
# Form UK equivalents of:
# "Committee for Assurances" - track all promises
# "Committee for Attendance" - see which seats should be declared vacant
# "Spectrum of Interest" - what subjects in career they followed, so can
# tell if would be no use as Minister for Education
# Analyse all task MP does, time spent, and see if they are doing either
# enough or implausibly too much.
# Dublin parliament
# http://www.oireachtas-debates.gov.ie/
# Letter writing campaign by their constituents to MPs who use procedure
# in an entirely negative way - for example here, when Labour maliciously
# moved the house to sit in private, in order to stop the democratic
# discussion of the role of ministerial advisers
# http://www.publications.parliament.uk/pa/cm200203/cmhansrd/cm030314/debtext/30314-19.htm
# About one MP
# ------------
# Name (title, first, last)
# Photo
# Party (link to party website)
# Parliamentary terms served
# Failed candidacy
# Home page
# Constituency address / phone / fax
# Parliament address / phone / fax
# Email
# Surgery times
# Date of birth
# http://www.parliament.uk/directories/hciolists/alms.cfm
# http://www.psr.keele.ac.uk/area/uk/mps97.htm
# http://www.psr.keele.ac.uk/area/uk/mps.htm
# http://www.election.demon.co.uk/strengths.html
# Bio (journalistic)
# http://www.politicallinks.co.uk
# http://www.dodonline.co.uk/politics2/BIOG/MP_BIOGS/bio.asp?id=0163
# Salary
# Expenses - London stay (£150/night!), motoring
# Pension
# Include pay from members interests
# http://www.parliament.uk/faq/pay_faq_page.cfm
# Register of interests
# http://www.parliament.the-stationery-office.co.uk/pa/cm/cmregmem
# Donations they have made (to political parties?)
# http://www.cleanpolitix.com
# Entry from Who's who
# Company directorships
# Links to info on companies involved with
# Recent speeches (text)
# http://www.publications.parliament.uk
# Voting record
# http://www.publications.parliament.uk/pa/cm/cmhansrd.htm
# Committee membership
# Bills placed before parliament
# Video of them in parliament (or elsewhere)
# Written questions asked and answered
# Local council positions
# Mayorships
# Responsiveness to faxes (also photo?)
# http://www.faxyourmp.com/stats.php3
# Voting activity
# Number of questions asked
# Time spent in parliament
# News mentions (Google news)
# Weblog mentions
# Search engine hits
# Links to their home page
# Comments/user provided links
# As much data historical, as well as current
# Sources:
# http://politics.guardian.co.uk/person/0,9290,-816,00.html
# http://www.epolitix.com
# dfs = []
# meta_dict = {}
# descr_dict = {}
# column_dict = {}
# count = 0
# Parse every Public Whip vote matrix (.dat) in the folder, together with its
# companion .txt key file, and cache tidy CSVs next to them:
#   <base>_matrix.csv - divisions x MPs matrix with categorical vote labels
#   <base>_mps.csv    - mpid / firstname / surname / party lookup table
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".dat"):
        file_name_base = filename.split(".dat")[0]
        # file_name_base = "votematrix-2010"
        df = pd.read_csv(PublicWhip_folder+file_name_base+'.dat',sep='\t')#,engine='python'
        # MP votes for each division
        # The .dat files are tab-separated text files for loading into a spreadsheet.
        # They contain a matrix of every vote of each MP/Lord in each division.
        # The columns are headed by the identifiers of the MPs/Lords, and the rows begin with the date,
        # number and title of the division. Each .txt file explains what number represents aye, no, abstain
        # and so on, and gives a key to the MP/Lord identifiers.
        # Divisions x MPs
        # `search` is a project helper from utility — presumably selects columns
        # whose names contain "mpid" (TODO confirm against utility.py).
        # Map numeric vote codes to labels (replace_dict) and store as category.
        df[search(df,"mpid").index] = df[search(df,"mpid").index].replace(replace_dict).astype('category')
        df["date"]=df["date"].apply(lambda x: pd.to_datetime(x))
        df.to_csv(PublicWhip_folder+file_name_base+"_matrix.csv")
        # Companion .txt: the first four lines are a header (title, created
        # date, MP count, division count). NOTE(review): sep='\r\n' is a
        # multi-character separator, which pandas treats as a regex under the
        # python engine — it effectively yields one column per line. Fragile.
        df2 = pd.read_csv(PublicWhip_folder+file_name_base+'.txt',sep='\r\n',header =None)
        file_string = df2.loc[0].values[0]
        file_created_string = df2.loc[1].values[0]
        num_mps = int( df2.loc[2].values[0].split(" mps")[0] )
        # NOTE(review): despite the `_string` suffix this is an int
        num_divisions_string = int( df2.loc[3].values[0].split(" divisions")[0] )
        print(file_string)
        print(file_created_string)
        print(num_mps)
        print(num_divisions_string)
        # The MP key table starts after 19 header rows in the same .txt file
        df3 = pd.read_csv(PublicWhip_folder+file_name_base+'.txt',sep='\t',skiprows =19)
        df3["party"] = df3["party"].astype('category')
        # party - object -> categorical
        # mpid - unique int64
        df3 = df3[['mpid', 'firstname', 'surname', 'party']]
        df3.to_csv(PublicWhip_folder+file_name_base+"_mps.csv")
        # https://www.publicwhip.org.uk/mp.php?mpid=40289
# df[search(df,"mpid").index].stack().value_counts()
# df["date"].value_counts()
# df["voteno"].value_counts()
# rowid unique
# date - object -> datetime, not unique
# voteno - not unique, 1-544
# Bill - object, not unique
# df3["party"].value_counts()
# df[search(df,"mpid").index].stack().value_counts()
# From here on, analyse the 2015 parliament using the CSVs cached by the loop above.
file_name_base = "votematrix-2015"
# "Unnamed: 0" is the unnamed index column that DataFrame.to_csv wrote out
matrix = pd.read_csv(PublicWhip_folder+file_name_base+"_matrix.csv", index_col = "Unnamed: 0")
mps = pd.read_csv(PublicWhip_folder+file_name_base+"_mps.csv", index_col = "Unnamed: 0")
# MPS are doubled - looks like it's so it stores their behaviour before/after splits
matrix.shape,mps.shape
# file_name_base = "votematrix-2017"
# matrix = pd.read_csv(PublicWhip_folder+file_name_base+"_matrix.csv")
# mps = pd.read_csv(PublicWhip_folder+file_name_base+"_mps.csv")
# # MPS are doubled - looks like it's so it stores their behaviour before/after splits
# df = pd.get_dummies(matrix[search(matrix,"mpid").index]).T
# df_cols = df.columns
# df_ind = df.index
# Collapse vote labels to numbers for the numeric encoding below:
# ayes (including tellers) -> 1, noes (including tellers) -> 0,
# missing/both -> NaN (later filled with each bill's column mean).
replace_dict_2 = {
    "aye": 1,
    "tellaye": 1,
    "no": 0,
    "tellno": 0,
    "missing": np.nan,
    "both": np.nan,
}
# Display name used later for chart labels/tooltips
mps["name"] = mps["firstname"]+"_"+mps["surname"]
# Matrix columns are named like "mpid12345"; recover the numeric ids in column order
sorted_mp_num_ids = [int(x.replace('mpid','')) for x in search(matrix,"mpid").index]
# Reorder the MP table so its rows line up with the matrix's mpid columns
mps = mps.set_index("mpid").loc[sorted_mp_num_ids]
# Bill titles repeat across divisions; disambiguate with the division number
matrix["uniqueBillName"] = matrix["Bill"]+"(voteno:"+matrix["voteno"].astype('str')+")"
matrix[search(matrix,"mpid").index].stack().value_counts()
# To run PCA on this dataset we need to turn it into a fully filled-in numerical matrix.
# Two options:
# - dummy every variable (for each bill, one column for bill_aye, bill_missing, bill_no, bill_tellaye ..)
# - aye=1, no=0, treat tellaye/tellno as aye/no, and fill in missing/both values with the bill mean
# Option 1: one-hot ("dummy") encoding — MPs x (bill, vote-label) indicators.
# NOTE: df is rebuilt by the numeric-encoding block that follows in the file,
# so this result is exploratory only.
df = matrix[search(matrix,"mpid").index].T
df.columns = matrix["uniqueBillName"]
df = pd.get_dummies( df )
# Index: "mpidNNN" -> NNN -> "First_Last" (via the reordered mps table)
df.index = [int(x.replace('mpid','')) for x in df.index]
df.index = [mps.loc[x,"name"] for x in df.index]
df_cols = df.columns
df_ind = df.index
df.head()
# Option 2 (the encoding actually used downstream): numeric values via
# replace_dict_2, with missing/both filled by each bill's mean vote.
df = matrix[search(matrix,"mpid").index].replace(replace_dict_2).T
df = df.fillna(df.mean())
# Index: "mpidNNN" -> NNN -> "First_Last" (via the reordered mps table)
df.index = [int(x.replace('mpid','')) for x in df.index]
df.index = [mps.loc[x,"name"] for x in df.index]
df.columns = matrix["uniqueBillName"]
df_cols = df.columns
df_ind = df.index
df.head()
# Where decomposition artefacts (plots, pickles, HDF files) are written.
output_subfolder = create_subdir(PublicWhip_folder,"DimensionalReduction")
# No dataset manifest for this data; display_components handles None.
manifest = None
def clean_filename(filename, whitelist=valid_filename_chars, replace=' ', char_limit = 30):
    """Sanitise *filename* for safe use as a file name.

    Each character in *replace* becomes an underscore, the text is
    NFKD-normalised and reduced to plain ASCII, anything not in *whitelist*
    is dropped, and the result is truncated to *char_limit* characters
    (with a warning, since truncation can make names collide).
    """
    # replace unwanted characters (by default just spaces) with underscores
    for r in replace:
        filename = filename.replace(r,'_')
    # decompose accented characters and keep only their ASCII part
    cleaned_filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode()
    # keep only whitelisted chars
    cleaned_filename = ''.join(c for c in cleaned_filename if c in whitelist)
    if len(cleaned_filename)>char_limit:
        # `warnings` is imported at module level; the redundant function-local
        # `import warnings` has been removed.
        warnings.warn("Warning, filename truncated because it was over {}. Filenames may no longer be unique".format(char_limit))
    return cleaned_filename[:char_limit]
def display_components(n_components, decomp, cols, BES_decomp, manifest,
                       save_folder = False, show_first_x_comps=4,
                       show_histogram=True, flip_axes=True):
    # Plot, for each of the first min(n_components, 20) components of a fitted
    # decomposition, a horizontal bar chart of the 30 largest-magnitude
    # variable coefficients (optionally alongside a histogram of |coeffs|).
    # Returns (BES_decomp possibly sign-flipped in place, {comp_no: title},
    # {comp_no: coefficient DataFrame sorted by |magnitude|}).
    #
    # NOTE(review): indentation was reconstructed from a flattened source;
    # the structure below (especially the save/no-save else branch, which
    # re-titles and re-plots without the dataset description) should be
    # confirmed against the original notebook.
    if hasattr(decomp, 'coef_'):
        decomp_components = decomp.coef_
    elif hasattr(decomp, 'components_'):
        decomp_components = decomp.components_
    else:
        raise ValueError('no component attribute in decomp')
    # hardcoded at 20?
    n_comps = min(n_components,20)
    comp_labels = {}
    comp_dict = {}
    for comp_no in range(0,n_comps):
        # one figure per component; second axes only when showing the histogram
        fig, axes = plt.subplots(ncols=1+show_histogram)
        comp = pd.DataFrame( decomp_components[comp_no], index = cols, columns = ["components_"] )
        comp["comp_absmag"] = comp["components_"].abs()
        # ascending sort so the highest-|coeff| variable ends up last
        comp = comp.sort_values(by="comp_absmag",ascending=True)
        if show_histogram:
            comp_ax = axes[0]
            hist_ax = axes[1]
            hist_ax.set_xlabel("abs. variable coeffs")
            hist_ax.set_title("Histogram of abs. variable coeffs")
            comp["comp_absmag"].hist( bins=30, ax=hist_ax, figsize=(10,6) )
        else:
            comp_ax = axes
        # set top abs_mag variable to label
        comp_labels[comp_no] = comp.index[-1:][0] # last label (= highest magnitude)
        # if top abs_mag variable is negative
        if flip_axes & (comp[-1:]["components_"].values[0] < 0):
            # flip the sign convention so the dominant variable loads positively;
            # this mutates decomp_components and BES_decomp in place
            comp["components_"] = -comp["components_"]
            decomp_components[comp_no] = -decomp_components[comp_no]
            BES_decomp[comp_no] = -BES_decomp[comp_no]
        if manifest is not None:
            dataset_description = manifest["Friendlier_Description"].values[0]+ "\n"
            dataset_citation = "Source: " + manifest["Citation"].values[0]
            comp_ax.annotate(dataset_citation, (0,0), (0, -40),
                             xycoords='axes fraction', textcoords='offset points', va='top', fontsize = 7)
        else:
            dataset_description = ""
        title = "Comp. "+str(comp_no)+" (" + str( comp.index[-1:][0] ) + ")"
        comp_labels[comp_no] = title
        comp_ax.set_title( dataset_description + title )
        comp_ax.set_xlabel("variable coeffs")
        # keep the x range at least [-1, 1] so small coefficients don't mislead
        xlim = (min(comp["components_"].min(),-1) , max(comp["components_"].max(),1) )
        comp["components_"].tail(30).plot( kind='barh', ax=comp_ax, figsize=(10,6), xlim=xlim )
        if (save_folder != False):
            # NOTE(review): save_folder is concatenated without a path separator
            fname = save_folder + clean_filename(title, char_limit=60) + ".png"
            fig.savefig( fname, bbox_inches='tight' )
        else:
            # not saving: redo the title/plot without the dataset description
            title = "Comp. "+str(comp_no)+" (" + str( comp.index[-1:][0] ) + ")"
            comp_labels[comp_no] = title
            comp_ax.set_title( title )
            comp_ax.set_xlabel("variable coeffs")
            xlim = (min(comp["components_"].min(),-1) , max(comp["components_"].max(),1) )
            comp["components_"].tail(30).plot( kind='barh', ax=comp_ax, figsize=(10,6), xlim=xlim )
        comp_dict[comp_no] = comp
        # show first x components
        if (comp_no >= min(show_first_x_comps,n_components)):
            plt.close()
    return (BES_decomp, comp_labels, comp_dict)
# For people who don't want to destroy their eyesight, these components seem to be: (0) Government vs Opposition, (1) Scottish Independence vs Unionism, (2) Domestic Violence (I assume "anti" vs "less anti" rather than "pro"), (3) Northern Ireland / details of Brexit legislation.
# %%time
from sklearn.decomposition import TruncatedSVD
# >>> from scipy.sparse import random as sparse_random
# >>> from sklearn.random_projection import sparse_random_matrix
# >>> X = sparse_random(100, 100, density=0.01, format='csr',
# ...                   random_state=42)
# normalise data (subtract out the mean, divide through by standard deviation)
# NOTE(review): StandardScaler (and FactorAnalysis below) are not imported in
# this file — presumably re-exported by `from utility import *`; confirm.
clean_feature_set_std = StandardScaler().fit_transform(df.values )
BES_std = pd.DataFrame( clean_feature_set_std,
                        columns = df_cols,
                        index = df_ind )
# 5 mins
## RUN (some variant of) PCA (potentially v slow!)
# as many components as the smaller dimension of the matrix allows
n_components = min(BES_std.shape[0],BES_std.shape[1])
# n_components = 20
# (svd_solver='full', n_components='mle',whiten=True)
# decomp = PCA(n_components = n_components,svd_solver='full')
# decomp = FastICA(algorithm='deflation', fun='exp', fun_args=None, max_iter=1000,
#                  n_components=None, random_state=None, tol=0.07, w_init=None, whiten=False) # 2h 1min 4s "fast"
# decomp = SparsePCA(n_components=n_components, alpha=2,max_iter=1000,n_jobs=4,tol=1e-10, verbose=True) # 5min
#
# alpha=2 -> 1hr
decomp = FactorAnalysis(svd_method = 'lapack',n_components = n_components) ## ~10s ,n_components=30 -> 1.5 hrs
# decomp = TSNE(n_components=n_components, verbose=1, perplexity=40, n_iter=300)
# decomp = TruncatedSVD(n_components=n_components)
# estimator class name, pulled out of its repr (e.g. "FactorAnalysis")
decomp_method = str(decomp).split("(")[0]
# ,n_components=30
X_r = decomp.fit_transform(BES_std)
# MPs x components score matrix
BES_decomp = pd.DataFrame( X_r,
                           columns = range(0,n_components),
                           index = df_ind)
# treatdir = BES_data_folder+Treatment
# subdir = BES_data_folder+Treatment+decomp_method
# Persist (or reload) the decomposition: pickle for the estimator, HDF for
# the score matrix and the standardised inputs.
load_suff = "FactorAnalysis"
save = True # False => Load
if save & ( 'decomp' in globals() ): # SAVE ##( 'decomp' not in globals() )
    decomp_method = str(decomp).split("(")[0]
    # NOTE(review): no os.sep here, unlike the load branch below — the two
    # branches build different paths; confirm which layout is on disk.
    subdir = output_subfolder + decomp_method
    fname = subdir+ os.sep + decomp_method
    # create dir, save decomp object, BES_decomp, BES_std
    if not os.path.exists(subdir): os.makedirs(subdir)
    with open(fname+".pkl", "wb") as f: pickle.dump( decomp, f )
    BES_decomp.to_hdf(fname+".hdf" , decomp_method)
    BES_std.to_hdf( fname+"_std"+".hdf" , decomp_method)
else: # LOAD decomp results (default is SAVE)
    decomp_method = load_suff
    subdir = output_subfolder + os.sep + decomp_method
    fname = subdir + os.sep + decomp_method
    if not os.path.exists(subdir): raise Exception(subdir + ' does not exist!')
    # load decomp object, BES_decomp, BES_std, n_components
    with open(fname+".pkl", "rb") as f: decomp = pickle.load(f)
    BES_decomp = pd.read_hdf(fname+".hdf")
    BES_std = pd.read_hdf(fname+"_std"+".hdf")
    n_components = decomp.components_.shape[0]
# Plot the top components; may flip signs in BES_decomp in place.
(BES_decomp, comp_labels, comp_dict) = display_components(n_components, decomp,
                                                          df_cols, BES_decomp, manifest,
                                                          save_folder = subdir,
                                                          show_first_x_comps= 4, show_histogram = False)
# sns.lineplot(x=range(0,len(decomp.explained_variance_[0:10])),y=decomp.explained_variance_[0:10])
# Matplotlib colour per party code. Both short and long-form codes occur in
# the data (e.g. "Lab"/"LAB"); several parties share a colour.
party_colour_dict = {
    party: colour
    for colour, parties in [
        ("red", ["Lab", "LAB"]),
        ("blue", ["Con", "CON"]),
        ("purple", ["UKIP"]),
        ("orange", ["LDem"]),
        ("green", ["Grn", "Green"]),
        ("cyan", ["BrexitParty", "BXP"]),
        ("pink", ["TIG", "SDLP"]),
        ("magenta", ["BNP", "DUP"]),
        ("yellow", ["SNP"]),
        ("olive", ["PC"]),
        ("black", ["SPK", "SPE"]),
        ("brown", ["OTH"]),
        ("grey", ["UNKNOWN", "Independent"]),
        ("lightblue", ["UUP"]),
    ]
    for party in parties
}
# mps["party"].replace(party_colour_dict).value_counts()
# The motivation for looking at this data came from an article by Marie Le Conte and follow-up comments by Stephen Bush criticising the way data is presented by the They Work For You website.
# A lot of the time when humanities-grad journalists** complain about data lacking vital context, it's an unsubtle mixture of (i) "you can't capture the human soul with numbers!" and (ii) "can a computer read their wiki page, recount anecdotes from meeting an MP at a party and balance a ball on its nose? can it huh huh? You can't fire me!"
# But in this case the points raised are not just on-the-nose questions you'd get from a room full of quants - they are also things you can directly address with a little bit of code.
# From Marie Le Conte's article: "Still, there is a bit of an issue: the screengrabs from TWFY are often misleading. Take James Heappey as an example. The Conservative MP has, according to the website, “consistently voted against measures to prevent climate change”. ... This seems odd. Heappey used to sit on the energy and climate change select committee and currently chairs the all-party parliamentary group on renewable and sustainable energy, of which Green MP Caroline Lucas is also a member. He is heading a project for Carbon Connect, which “seeks to inform and guide a low carbon transformation underpinned by sustainable energy”, and had Ed Miliband as a keynote speaker when it launched. It could well be that Heappey is two-faced and talks the talk without walking the walk; it wouldn’t be a first for a politician. Let’s take a closer look anyway. In all of those votes but one, every Conservative MP voted the same way, and in one of them, two Conservative MPs voted the other way. There is no record of party whips on TWFY, but we can reasonably assume that those votes were whipped, so all Tory MPs were instructed to vote a certain way. Then, it seems worth pointing out that five of the nine votes were parts of summer budgets and finance bills, so details in much wider legislation set by the chancellor of the exchequer. As with other major votes, it would be extremely unusual for an MP from a governing party to vote against their budget."
# ** (actually I have no idea about Marie Le Conte's educational background)
# This is eminently doable with the data available - it's very hard to understand the exact context of a given vote ... but it's really easy to see how many people voted against it.
# You don't even need access to whether or not a vote was whipped - you can just look at the pattern of voting to see whether it divides parties.
# Case in point - using only voting-pattern data - without including any reference to which party MPs are nominally part of (except for colouring them in the chart) you can easily cluster MPs in a way which indicates the de facto party whip.
# Scatter of the first two component scores, coloured by party
plt.figure(figsize=(12,8))
sns.scatterplot(x=BES_decomp[0],y=BES_decomp[1], hue = mps["party"].values, palette=party_colour_dict );
# plt.title()
from sklearn.manifold import TSNE
import time

# Project the vote matrix down to two dimensions with t-SNE, timing the fit.
start_time = time.time()
tsne_model = TSNE(n_components=2, verbose=1, perplexity=20, n_iter=3000)
embedding_2d = tsne_model.fit_transform(df)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - start_time))

# Re-wrap the raw embedding as a DataFrame, indexed by MP, columns 0 and 1
# (same shape the downstream plotting cells expect).
tsne_results = embedding_2d
BES_decomp = pd.DataFrame(tsne_results, columns=range(0, 2), index=df_ind)
import mpld3

# Static matplotlib scatter of the embedding, coloured by party, with an
# mpld3 hover tooltip showing each MP's name.
fig = plt.figure(figsize=(12, 8))
point_colours = mps["party"].replace(party_colour_dict).values
scatter = plt.scatter(
    x=BES_decomp[0],
    y=BES_decomp[1],
    c=point_colours,
    alpha=0.3,
)
plt.title("2015 Parliament\nMPs divided automatically by voting behaviour", size=20)

# Attach a per-point tooltip carrying the MP's name.
labels = mps["name"].values
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display()
# mps["name"].value_counts()
# MPs which change party get recorded as different entities!
# FIX: the original wrote `party_colour_dict.values` (no parentheses), which
# displays the bound method object rather than the dict's colour values.
party_colour_dict.values()
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import column, row
from bokeh.embed import components
from bokeh.transform import factor_cmap
from bokeh.models import HoverTool

# Interactive Bokeh version of the embedding scatter, with a hover tool
# showing each MP's name and party.
df = pd.concat(
    [BES_decomp[0], BES_decomp[1], mps.set_index("name")["party"]],
    axis=1,
)
df.columns = ["PCA 0", "PCA 1", "party"]
source = ColumnDataSource(df.reset_index())

# One fixed colour per party, applied to both fill and outline.
party_cmap = factor_cmap(
    'party',
    palette=list(party_colour_dict.values()),
    factors=list(party_colour_dict.keys()),
)
hover = HoverTool(tooltips=[('name', '@name'), ('party', '@party')])

p1 = figure(
    plot_width=800,
    plot_height=400,
    title="2015 Parliament\nMPs divided automatically by voting behaviour",
    tools='box_select,pan,wheel_zoom,box_zoom,reset',
    active_drag="box_select",
)
p1.scatter(
    'PCA 0', 'PCA 1',
    source=source,
    fill_color=party_cmap,
    line_color=party_cmap,
    alpha=0.8,
)
p1.add_tools(hover)
show(p1)
Comments
comments powered by Disqus