https://twitter.com/HzBrandenburg/status/1266724167367438336

In [47]:

%%time
dataset_name = "W19_comb"
df_list = [ "BES_Panel" ]

%matplotlib inline 
%run BES_header.py {dataset_name} {df_list}

if "id" in BES_Panel.columns:
    BES_Panel = BES_Panel.set_index("id").sort_index()

(var_type, cat_dictionary, new_old_col_names, old_new_col_names) = get_small_files(data_subfolder, encoding)    
    
# get full set of inferred "cross wave" auth-lib/left-right values and ages
pan_dataset_allr_values = pd.read_csv(BES_small_data_files + "pan_dataset_allr_values"+".csv")
pan_dataset_ages = pd.read_pickle(BES_small_data_files + "pan_dataset_ages"+".zip", compression='zip')

var_type (7911, 14)
Wall time: 30 s

In [48]:

BES_Panel = pd.read_pickle("..\\BES_analysis_data\\"+"W19_comb"+os.sep+"BES_Panelv02",compression='zip')

BES_Panel = BES_Panel.set_index("id")
BES_Panel = BES_Panel.sort_index()
BES_Panel["id"] = BES_Panel.index

In [49]:

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show,  output_notebook
from bokeh.layouts import column, row, layout
from bokeh.embed import components
from bokeh.models import HoverTool
from bokeh.models import Span, Label
from bokeh.models import BoxAnnotation
output_notebook()
from datetime import timedelta
from scipy.signal import find_peaks
from bokeh.models import Span
from datetime import datetime
import pytz

Loading BokehJS ...

In [50]:

def weighted_mean(x, **kws):
    val, weight = map(np.asarray, zip(*x))
    val, weight = val[~np.isnan(val)],weight[~np.isnan(val)]
    return (val * weight).sum() / weight.sum()

def datetime_weighted_mean(x, **kws):
    val, weight = map(np.asarray, zip(*x))
    val = pd.Series(val).apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
    mask = (~np.isnan(val))
    val, weight = val[mask],weight[mask]
    result = (val * weight).sum() / np.sum(weight)
    result = datetime.fromtimestamp(result,tz=pytz.timezone('GMT')) if pd.notnull(result) else np.nan  # turn back from timestamp
    return result

# max_wave = int(re.match("W(\d+)_",dataset_name).groups()[0])
max_wave = np.max([int(x.replace("wave","")) for x in match(BES_Panel,"wave\d+$").index])
num_to_wave = {x:"W"+str(x) for x in range(1,max_wave+1)}
wts_for_wave = { "W"+str(y):[x for x in BES_Panel.columns.sort_values(ascending=False) if re.match("wt_(new|full)_W"+str(y)+"(_result)?"+"$",x)][0] for y in range(1,max_wave+1) }


waves = BES_Panel[search(BES_Panel,"wave").index].copy()
wts = BES_Panel[wts_for_wave.values()].isnull().copy()
wts.columns = waves.columns
drop_ids = BES_Panel.loc[((wts*waves).sum(axis=1)>0)].index
BES_Panel.drop(drop_ids, inplace=True)

# still leaves some with misssing

BES_Panel[list(wts_for_wave.values())] = BES_Panel[list(wts_for_wave.values())].replace(np.nan,1.0)

wave_to_date = BES_file_manifest[BES_file_manifest["Only_or_Combined"]=="Only"][["Wave No","Date_Start"]].set_index("Wave No")["Date_Start"]


### CHECK TIMES IN NEW 3 WAVES!

# fix endtimeW3 bug!
BES_Panel.loc[BES_Panel["endtimeW3"]=='1970-01-01 00:00:00',"endtimeW3"] = \
    BES_Panel[BES_Panel["endtimeW3"]=='1970-01-01 00:00:00']["starttimeW3"].values

# this is also suspect - date not unreasonable, but overlaps with a different wave! (3 days between start/end)
# 41222   2015-03-27 18:11:37.047
# Name: starttimeW5, dtype: datetime64[ns]
BES_Panel.loc[BES_Panel["starttimeW5"]=='2015-03-27 18:11:37.047000064',"starttimeW5"] = \
    BES_Panel[BES_Panel["starttimeW5"]=='2015-03-27 18:11:37.047000064']["endtimeW5"].values

# still some overlap between waves 4 and 5

midpoint_dict = {}
startpoint_dict = {}
endpoint_dict = {}
# create correct midpoints (technically we should weight these!)
n = 1
min_sample_size = 100
for wave_no in range(1,max_wave+1):
    wave = "W"+str(wave_no)
#     print(wave)

#     BES_Panel["midpoint"+wave] = pd.qcut(BES_Panel["endtime"+wave]+((BES_Panel["endtime"+wave]-BES_Panel["starttime"+wave])/2),n)
#     date_cats_dict = {BES_Panel["midpoint"+wave].cat.categories[x]:(BES_Panel["midpoint"+wave].cat.categories[x].left+ (BES_Panel["midpoint"+wave].cat.categories[x].right - BES_Panel["midpoint"+wave].cat.categories[x].left)/2).strftime("%Y-%m-%d") for x in range(n)}
#     BES_Panel["midpoint"+wave] = pd.to_datetime(BES_Panel["midpoint"+wave].replace(date_cats_dict))  
    BES_Panel["endpoint"+wave] = pd.to_datetime(BES_Panel["endtime"+wave]).max() 
    BES_Panel["startpoint"+wave] = pd.to_datetime(BES_Panel["starttime"+wave]).min() 
    
    BES_Panel["midtime"+wave] = (BES_Panel["starttime"+wave]+(BES_Panel["endtime"+wave]-BES_Panel["starttime"+wave])/2)
#     BES_Panel["midtime"+wave] = BES_Panel["midtime"+wave].apply(lambda x:x.replace(second=0, microsecond=0, nanosecond=0))
    BES_Panel["midpoint"+wave] = datetime_weighted_mean(zip(BES_Panel["midtime"+wave],BES_Panel[wts_for_wave[wave]]))

    startpoint_dict[wave] = BES_Panel["startpoint"+wave].dropna().values[0]
    endpoint_dict[wave] = BES_Panel["endpoint"+wave].dropna().values[0]
    midpoint_dict[wave] = BES_Panel["midpoint"+wave].dropna().values[0]

In [51]:

colour_list = ["blue","red","purple", "olive","orange","brown", "pink","cyan","grey","green"]

In [52]:

BES_Panel["id"] = BES_Panel.index

In [53]:

x_axis_label = "Response Date"
y_axis_label = "Responses/day"
plot_width=980
plot_height=600
tools='box_select,pan,xwheel_zoom,box_zoom,reset'
active_drag='pan'
active_scroll='xwheel_zoom'
title = "British Election Study Waves"
p= figure(x_axis_type="datetime", x_axis_label = x_axis_label,
            y_axis_label = y_axis_label,
            plot_width = plot_width, plot_height=plot_height,
            title = title, tools = tools, active_drag = active_drag, active_scroll=active_scroll)
p.title.text_font_size = '48pt'

date_col_dict = {"EE":'green',"GE":'red',"EU":'blue',"OB":'pink',"CL":'purple',"CAMP":'orange',"normal":'grey'}  

date_dict = {"22 May 2014":"EE","7 May 2015":"GE","23 June 2016":"EU",
             "8 June 2017":"GE","23 May 2019":"EE","21 April 2016":"OB",
             "9 October 2014":"CL","12 December 2019":"GE",
             "3 March 2015":"CAMP", "23 April 2017":"CAMP", "6 November 2019":"CAMP","15 April 2016":"CAMP"}

date_duration_dict = {"22 May 2014":1,"7 May 2015":1,"23 June 2016":1,
             "8 June 2017":1,"23 May 2019":1,"21 April 2016":2,
             "9 October 2014":1,"12 December 2019":1,
             "3 March 2015":37, "23 April 2017": 45, "6 November 2019":35,"15 April 2016":68 }
    
#     date_col_dict = {"EE":'green',"GE":'red',"EU":'blue',"OB":'pink',"CL":'purple',"CAMP":'orange'}  

wave_type_dict = dict(zip(["W"+str(x) for x in range(1,max_wave+1)],["normal"]*max_wave))

wave_type_dict["W1"] = "PRECAMP"
wave_type_dict["W4"] = "PRECAMP"
wave_type_dict["W7"] = "PRECAMP"
wave_type_dict["W11"] = "PRECAMP"
# wave_type_dict["W14"] = "PRECAMP"
wave_type_dict["W17"] = "PRECAMP"

wave_type_dict["W2"] = "CAMP"
wave_type_dict["W5"] = "CAMP"
wave_type_dict["W8"] = "CAMP"
wave_type_dict["W12"] = "CAMP"
wave_type_dict["W15"] = "CAMP"
wave_type_dict["W18"] = "CAMP"

wave_type_dict["W3"] = "POSTCAMP"
wave_type_dict["W6"] = "POSTCAMP"
wave_type_dict["W9"] = "POSTCAMP"
wave_type_dict["W13"] = "POSTCAMP"
wave_type_dict["W16"] = "POSTCAMP"
wave_type_dict["W19"] = "POSTCAMP"

wave_colour = { "PRECAMP": "red", "CAMP": "yellow", "POSTCAMP": "green", "normal":"grey" }

for wave in startpoint_dict.keys():

    my_label = Label(x=midpoint_dict[wave], y=200, y_units='screen', text=wave)
    df = BES_Panel[["midtime"+wave,"id"]].set_index("midtime"+wave).resample('1d').count()["id"]
    ind = df.index.values
    ind[0] = startpoint_dict[wave]
#     ind[-1] = endpoint_dict[wave]
    df.index = ind
    p.quad(top=df.values, bottom=0, left=df.index, right=np.append(df.index[1:].values, endpoint_dict[wave] ),
           fill_color="navy", line_color="white", alpha=0.5)

    box_left = startpoint_dict[wave]
    box_right = endpoint_dict[wave]

    box = BoxAnnotation(left=box_left, right=box_right,
                        line_width=1, line_color='black', line_dash='dashed',
                        fill_alpha=0.2, fill_color= wave_colour[wave_type_dict[wave]])
                        #(colour_list+colour_list)[int(wave.replace("W",""))] )
    
    p.add_layout(my_label) 
    p.add_layout(box)
for date in date_dict.keys():       

    if date_dict[date]=="CAMP" or date_dict[date]=="CL" or date_dict[date]=="OB":
        continue
    my_label = Label(x=pd.to_datetime(date), y=400, y_units='screen', text=date_dict[date])
    p.add_layout(my_label) 

    box_left = pd.to_datetime(date)
    box_right = pd.to_datetime(date)+timedelta(days=date_duration_dict[date])

    box = BoxAnnotation(left=box_left, right=box_right,
                        line_width=1, line_color='black', line_dash='dashed',
                        fill_alpha=0.2, fill_color=date_col_dict[date_dict[date]])

    p.add_layout(box)   

show(p)

In [54]:

def time_series(var_name,title,subtract_var=False,retain_var=True,specific_dates=True, specific_suffix_set="([a-zA-Z]*)",
                use_midpoints=False,col_name="party",dk_str="Don't know",max_y_size=10.0,min_waves_included=2,
                max_y_size_dict=None,max_wave=max_wave,use_BES_weights=True,mask=None):

    whole_wave_dk_average = True
    df=pd.DataFrame()
    df2 =pd.DataFrame()
    df_wts =pd.DataFrame()
    df_dates = pd.DataFrame()
    
    if subtract_var:
        title=title+"\n(where respondents place parties relative to their own preference set at 0.5)"

    wave_list = []
    redist_vars = pd.Series([re.match(var_name+specific_suffix_set+"($|W\d+)",x).groups()[0] for x in BES_Panel.columns\
                             if re.match(var_name+specific_suffix_set+"($|W\d+)",x)]).value_counts()
    redist_vars = redist_vars[redist_vars>=min_waves_included].index

    if mask is None:
        mask = BES_Panel["id"].notnull()
    
    for subj in redist_vars:
        for wave in ["W"+str(x) for x in range(1,max_wave+1)]:
            if var_name+subj+wave not in BES_Panel.columns:
                continue
            else:
                wave_list.append(wave)
            if max_y_size_dict:
                max_y_size = max_y_size_dict[subj]
#             df[var_name+"num_"+subj+"_"+wave] = zip(BES_Panel[var_name+subj+wave].replace(dk_str,np.nan).cat.codes.replace(-1,np.nan)/max_y_size,
#                                                  BES_Panel[var_name+subj+wave].apply(lambda x: x==dk_str if pd.notnull(x) else np.nan),
#                                                  BES_Panel[wts_for_wave[wave]]
#                                                    )
            
            df[var_name+"num_"+subj+"_"+wave] = BES_Panel[var_name+subj+wave][mask].replace(dk_str,np.nan).cat.codes.replace(-1,np.nan)/max_y_size

            
            df2[var_name+"dk_"+subj+"_"+wave] = (BES_Panel[var_name+subj+wave][mask]==dk_str).astype('float32')
            df2[var_name+"dk_"+subj+"_"+wave][BES_Panel[var_name+subj+wave][mask].isnull()] = np.nan

           
            
    for wave in pd.unique(wave_list):        
        df_wts["wt_"+wave] = BES_Panel[wts_for_wave[wave]][mask]
        
        if use_midpoints:
            df_dates["dt_"+wave] = BES_Panel["midpoint"+wave][mask]
        else:
            df_dates["dt_"+wave] = (BES_Panel["starttime"+wave][mask]+(BES_Panel["endtime"+wave][mask]-BES_Panel["starttime"+wave][mask])/2)
#             .apply(lambda x:x.date())

#     set_trace()
#     df_wts = df_wts[df.notnull().any(axis=1)]
#     df = df[df.notnull().any(axis=1)]

    df2.columns = df2.columns.str.split('_', expand=True)
    df2 = df2.stack(dropna=False)\
                .reset_index()\
                .rename(columns={'level_1':"wave"})\
                .rename(columns={'level_0':"id"}) 

    df_wts.columns = df_wts.columns.str.split('_', expand=True)
    df_wts = df_wts.stack(dropna=False)\
                .reset_index()\
                .rename(columns={'level_1':"wave"})\
                .rename(columns={'level_0':"id"}) 

    df_dates.columns = df_dates.columns.str.split('_', expand=True)
    df_dates = df_dates.stack(dropna=False)\
                .reset_index()\
                .rename(columns={'level_1':"wave"})\
                .rename(columns={'level_0':"id"}) 

    df.columns = df.columns.str.split('_', expand=True)
    df = df.stack(dropna=False)\
            .reset_index()\
            .rename(columns={'level_1':"wave"})\
            .rename(columns={'level_0':"id"}) 

    content_columns = [(var_name+"dk",x) for x in redist_vars]
    df["wt"] = df_wts["wt"]
    df[content_columns] = df2[content_columns]
    df["date"] = df_dates["dt"]
    content_columns = [(var_name+"num",x) for x in redist_vars]+[(var_name+"dk",x) for x in redist_vars]
    # only keep rows with content (variable values/dks)

    df = df[df[content_columns].notnull().any(axis=1)]

#     df = df.loc[ df[[x for x in df.columns if var_name+"num" in x]].notnull().any(axis=1) ]
    df.loc[:,"wt"] = df.loc[:,"wt"].fillna(1.0).values
    temp_ind_name = "temp_index"

#     if specific_dates:
#         df["date"] = df[["id","wave"]].merge(right=df_dates,
#                  how="left",left_on=["id","wave"],right_on=["id","wave"])["dt"].values

    df[temp_ind_name] = list(zip(df["wave"],df["wt"],df["date"],df["id"]))
    df = df.set_index(temp_ind_name).drop(["id","wave","wt","date"],axis=1)



    if subtract_var:
        if retain_var:
            focal_vars = [x for x in df.columns if (var_name+"num" in x) and (subtract_var not in x)]
            df[focal_vars] = df[focal_vars].apply(lambda x: x-df[(var_name+"num",  subtract_var)])+0.5
        else:
            focal_vars = [x for x in df.columns if var_name+"num" in x]
            df[focal_vars] = df[focal_vars].apply(lambda x: x-df[(var_name+"num",  subtract_var)])+0.5
            df.drop((var_name+"num",  subtract_var),axis=1,inplace=True)

#     df2["wt"] = df_wts["wt"]

#     df2 = df2.loc[ df2[[x for x in df2.columns if var_name+"dk" in x]].notnull().any(axis=1) ]
#     df2.loc[:,"wt"] = df2.loc[:,"wt"].fillna(1.0).values
#     if specific_dates:
#         df2["date"] = df2[["id","wave"]].merge(right=df_dates,
#                  how="left",left_on=["id","wave"],right_on=["id","wave"])["dt"].values

#     temp_ind_name = "temp_index"
#     df2[temp_ind_name] = list(zip(df2["wave"],df2["wt"],df2["date"]))
#     df2 = df2.set_index(temp_ind_name).drop(["id","wave","wt","date"],axis=1)


    flat_df_num = df.stack().reset_index().rename(columns={'level_1':col_name})

    if specific_dates:
        flat_df_num["wave"] = flat_df_num[temp_ind_name].apply(lambda x:x[0])    
    else:
        flat_df_num["wave"] = flat_df_num[temp_ind_name].apply(lambda x:int(x[0].split("W")[1]))

    flat_df_num["wt"]   = flat_df_num[temp_ind_name].apply(lambda x:x[1])
    flat_df_num["date"] = flat_df_num[temp_ind_name].apply(lambda x:x[2])
    flat_df_num["id"]   = flat_df_num[temp_ind_name].apply(lambda x:x[3])
    
    
    
    flat_df_num.drop(temp_ind_name,axis=1,inplace=True)
    flat_df_num[col_name] = flat_df_num[col_name].astype('category')

    flat_df_num[var_name+"_wts"] = list(zip(flat_df_num[var_name+"num"],flat_df_num["wt"]))
    flat_df_num[var_name+"_dk"] = list(zip(flat_df_num[var_name+"dk"],flat_df_num["wt"]))

    
#     flat_df_dk = df2.stack().reset_index().rename(columns={'level_1':col_name,0:"dk"})
#     if specific_dates:
#         flat_df_dk["wave"] = flat_df_dk[temp_ind_name].apply(lambda x:x[0])    
#     else:
#         flat_df_dk["wave"] = flat_df_dk[temp_ind_name].apply(lambda x:int(x[0].split("W")[1]))

#     flat_df_dk["wt"]   = flat_df_dk[temp_ind_name].apply(lambda x:x[1])
#     flat_df_dk["date"] = flat_df_dk[temp_ind_name].apply(lambda x:x[2])
    
#     flat_df_dk.drop(temp_ind_name,axis=1,inplace=True)
#     flat_df_dk[var_name+"dk"] = flat_df_dk[var_name+"dk"].astype('int')
    
#     if whole_wave_dk_average:
#     # calculating weighted total-wave dk average
#         flat_df_dk = flat_df_dk.groupby(["wave",col_name]).apply(lambda x: (x["wt"]*x[var_name+"dk"]).sum()/x["wt"].sum() ).reset_index().rename(columns={0:"dk"})
#         flat_df_num = flat_df_num.merge(right=flat_df_dk[[col_name,"wave","dk"]],how='left',left_on=[col_name,"wave"],right_on=[col_name,"wave"])        
#     else:
    
#         flat_df_dk["dk"+"_wts"] = list(zip(flat_df_dk[var_name+"dk"],flat_df_num["wt"]))
    

#     flat_df_num = flat_df_num.merge(right=flat_df_dk[[col_name,"wave","dk"]],how='left',left_on=[col_name,"wave"],right_on=[col_name,"wave"])
    if not specific_dates:
        flat_df_num["date"] = flat_df_num["wave"].apply(lambda x: wave_to_date[x])
        flat_df_num["date"] = pd.to_datetime(flat_df_num["wave"] , format="%b-%y")

#     raise Exception
        
    return flat_df_num, df, df2,

In [142]:

def sample_size(x, **kws):
    global max_sample_size    
    return len(x)/max_sample_size

def between_wave_retention(*args, **kwargs):
    global position_in_var_list
    var_name = kwargs['var_name'][position_in_var_list]
    position_in_var_list = position_in_var_list+1
    color = kwargs['color']
    label = kwargs['label']
    alpha = kwargs['alpha']
    df = BES_Panel[match(BES_Panel,var_name+"($|W\d+)").index].apply(lambda x: x.cat.codes.replace(-1,np.nan)).diff(axis=1).notnull().sum()/match(BES_Panel,var_name+"($|W\d+)")
    waves_present = {"W"+x.split("W")[-1]:BES_Panel["midpointW"+x.split("W")[-1]].dropna().values[0] for x in df.index}    
    df.index = [BES_Panel["midpointW"+x.split("W")[-1]].dropna().values[0] for x in df.index]
    df.drop(df.index[0],inplace=True)
    sns.lineplot(data=df,color=color,label=label,alpha=alpha)

    for wave in waves_present.keys():
        plt.text(x=waves_present[wave],y=0,s=wave, rotation=90, fontsize=12)   
        
def plot_time_series(var_name,specific_suffix_set,title,col_name,col_wrap,treatment,
                     max_wave,use_BES_weights=True,mask=None,n_boot=1,min_waves_included=2,
                     col_order=None,aspect=1):
    dk_str="Don't know"
    lowest_alpha=0.3
    low_alpha=0.5    
    date_dict = {"22 May 2014":"EE","7 May 2015":"GE","23 June 2016":"EU","8 June 2017":"GE","23 May 2019":"EE"}
    date_col_dict = {"EE":'green',"GE":'red',"EU":'blue'}    
#     print(1)
    max_y_size_dict = {x: len(BES_Panel[match(BES_Panel,var_name+x+"($|W\d+)").index[0]].cat.remove_categories(dk_str).cat.categories)-1 for x in specific_suffix_set.replace("(","").replace(")","").split("|") }
    # max_y_size=len(BES_Panel[ match(BES_Panel,var_name+specific_suffix_set+"($|W\d+)").index[0] ].cat.remove_categories(dk_str).cat.categories)-1,
    flat_df_num, df, df2, = time_series(var_name,title,specific_suffix_set=specific_suffix_set,col_name="party",
                                max_y_size_dict = max_y_size_dict,
                                dk_str=dk_str,min_waves_included=min_waves_included,use_midpoints=False,
                                max_wave=max_wave,use_BES_weights=use_BES_weights,mask=mask)
#     print(2)
#     raise Exception
    flat_df_num["wave_midpoint"] = flat_df_num["wave"].apply( lambda x: midpoint_dict[x] )
    flat_df_num["wave_startpoint"] = flat_df_num["wave"].apply( lambda x: startpoint_dict[x] )
    flat_df_num["wave_endpoint"] = flat_df_num["wave"].apply( lambda x: endpoint_dict[x] )
    
    
#     print(3)
    global max_sample_size
    global position_in_var_list
    position_in_var_list=0
#     max_sample_size = flat_df_num["dk"].value_counts().max()
    max_sample_size = flat_df_num.groupby(["party","wave"]).apply(lambda x: len(x)).max()
#     print(4)

    g = sns.FacetGrid(data=flat_df_num, col=col_name, col_wrap=col_wrap, ylim=(0.0, 1.0), legend_out=False,
                      height=height,aspect=aspect,col_order=col_order);
    
#     raise Exception
#     print(5)    
#     raise Exception    
    if use_BES_weights and (n_boot==1):
        g.map(sns.lineplot, "wave_midpoint",var_name+"_wts", ci=None, n_boot=1, label="Mean answer", estimator=weighted_mean,
              err_style="bars", markers=True, dashes=False);
        g.map(sns.lineplot, "wave_midpoint",var_name+"_dk", color='r', ci=None,n_boot=1,estimator=weighted_mean,
              err_style="bars", label="DK fraction",markers=True, dashes=False, alpha=low_alpha);        
    elif use_BES_weights and (n_boot!=1):
        g.map(sns.lineplot, "wave_midpoint",var_name+"_wts", n_boot=n_boot, label="Mean answer", estimator=weighted_mean,
              err_style="bars", markers=True, dashes=False);
        g.map(sns.lineplot, "wave_midpoint",var_name+"_dk", color='r', n_boot=n_boot,estimator=weighted_mean,
              err_style="bars", label="DK fraction",markers=True, dashes=False, alpha=low_alpha);        
        
    else:
        g.map(sns.lineplot, "wave_midpoint",var_name+"num", label="Mean answer",
              err_style="bars", markers=True, dashes=False);
        g.map(sns.lineplot, "wave_midpoint",var_name+"dk", color='r',
              err_style="bars", label="DK fraction",markers=True, dashes=False, alpha=low_alpha);        
    

#     print(7)            
    g.map(sns.lineplot, "wave_midpoint","wave", color='g', estimator=sample_size,ci=None,
          label="Sample Size\n(% of max: "+str(max_sample_size)+")",markers=True, dashes=False, alpha=low_alpha);
#     print(8)                
    g.map(between_wave_retention, "wave_midpoint","wave", color='k', var_name=[var_name+x for x in g.col_names],
          label="Retention",markers=True, dashes=False, alpha=low_alpha);
#     print(9)                

    for date in date_dict.keys():
        if flat_df_num["wave_midpoint"].min()<pd.to_datetime(date) and flat_df_num["wave_midpoint"].max()>pd.to_datetime(date):
            g.map(plt.axvline, x=pd.to_datetime(date), ls='--', c=date_col_dict[date_dict[date]], linewidth=1,
                  alpha=lowest_alpha, label=date_dict[date] )
#     print(10)              
    g.map(plt.axhline, y=0.5, ls='-.', c='grey',linewidth=1, alpha=lowest_alpha)

    g.add_legend().set_ylabels("").set_titles(col_template="{col_name}")
    g.fig.suptitle(title, y=1.0+0.03*len(title.split("\n")));
    [plt.setp(ax.get_xticklabels(), rotation=45) for ax in g.axes.flat]

    output_subfolder = create_subdir(BES_output_folder, treatment)
    g.savefig(output_subfolder +clean_filename(var_name +specific_suffix_set)+ ".png", bbox_inches='tight')
    return flat_df_num

In [171]:

# def rolling_av(party, var_name):

#     df_rolling = flat_df_num[flat_df_num["party"]==party].sort_values(by='date').copy().set_index('date')
# # df_rolling = df_rolling["immignum"].rolling(400,center=True).mean()
#     df_rolling = df_rolling[var_name].dropna().rolling(500,center=True,win_type='blackmanharris').mean()
# # win_type='gaussian').sum(std=3)
# # df_rolling = df_rolling["immignum"].rolling('1h',min_periods=10).mean()
#     return df_rolling

def weighted_mean_for_rolling(processed_dataframe_windowed):
    return np.average(a=processed_dataframe_windowed,weights=processed_dataframe_windowed.index)

def rolling_av_exerimental( party, var_name, win_type='blackmanharris',window_size=500 ):
    
    df_rolling = flat_df_num[flat_df_num["party"]==party].sort_values(by='date').copy()
    min_sample_size=200
    temp_dates = pd.qcut( df_rolling["date"], 
            int(np.floor(df_rolling["date"].notnull().sum()/(min_sample_size))) ).apply(lambda x: x.left+(x.right-x.left)/2)

    return df_rolling.groupby(temp_dates)[var_name].agg( lambda x: weighted_mean(x) )

def rolling_av( party, var_name, win_type='blackmanharris',window_size=250 ):

    df_rolling = flat_df_num[flat_df_num["party"]==party].sort_values(by='date').copy()
    fmt = "%Y-%m-%d:%H"

    # slow!
    if win_type=="BES":
        
        # this is a really dumb way to get the right date index!
        
        date_ind = df_rolling.set_index("date")[var_name].dropna().rolling(window_size,
            center=True).mean().index
#         print(date_ind.shape)
        #df_rolling["date"].copy().values
#         df_rolling.index = flat_df_num[flat_df_num["party"]==party].sort_values(by='date').copy()["date"].values
#         raise Exception
        df_rolling = df_rolling.set_index("wt")[[var_name,'date']].dropna().rolling(window_size,
            on='date',center=True)[var_name].apply(weighted_mean_for_rolling, raw=False)
#         print(df_rolling.shape)
        df_rolling.index = date_ind
#         flat_df_num[flat_df_num["party"]==party].sort_values(by='date').copy()["date"].values
#         return temp
    else:
        df_rolling = df_rolling.set_index("date")[var_name].dropna().rolling(window_size,center=True,win_type=win_type).mean()       


    df_rolling.index = [x.strftime(fmt) for x in df_rolling.index]
    df_rolling.reset_index().groupby('index').mean()
    df_rolling = df_rolling.reset_index().groupby('index').mean().dropna()
    df_rolling.index = [pd.to_datetime(x,format=fmt) for x in df_rolling.index]
    df_rolling.index.name='date'
    return df_rolling[var_name]
    

def date_fraction(x,df_temp):
    
    floor_x = int( np.floor(x) )
    return df_temp.index[floor_x]+(x-floor_x)*(df_temp.index[floor_x+1]-df_temp.index[floor_x])


def detect_peaks(df_rolling,prominence,width):

    df_temp = df_rolling.copy().dropna()
    peaks, properties = find_peaks(df_temp, prominence=prominence, width=width)
    properties["prominences"], properties["widths"]

    x= df_temp.index[peaks]
    y= df_temp[peaks].values

    outliers = pd.DataFrame(properties)
    outliers.index = x

    outliers["left_ips"]  = outliers["left_ips"].apply(lambda x: date_fraction(x,df_temp))
    outliers["right_ips"] = outliers["right_ips"].apply(lambda x: date_fraction(x,df_temp))

    aggregate_to_the_hour_fmt = "%Y-%m-%d:%H"
    aggregate_to_the_day_fmt = "%Y-%m-%d"
    fmt = aggregate_to_the_hour_fmt

    outliers["right_ips"] = outliers["right_ips"].apply(lambda x:pd.to_datetime(x.strftime(fmt),format=fmt) if pd.notnull(x) else np.nan)
    outliers["left_ips"] = outliers["left_ips"].apply(lambda x:pd.to_datetime(x.strftime(fmt),format=fmt) if pd.notnull(x) else np.nan)

    outliers.index = outliers.reset_index()["date"].apply(lambda x:pd.to_datetime(x.strftime(fmt),format=fmt) if pd.notnull(x) else np.nan)
    outliers = outliers.rename(columns = {"left_ips":"start","right_ips":"stop"})

    return x,y,outliers

def get_bokeh_source(flat_df_num,min_sample_size=100,var_name=""):

    flat_df_bokeh = flat_df_num.copy()

    # aggregate dates to the level you wish
    aggregate_to_the_hour_fmt = "%Y-%m-%d:%H"
    aggregate_to_the_day_fmt = "%Y-%m-%d"
    fmt = aggregate_to_the_day_fmt
    flat_df_bokeh["date"] = pd.to_datetime(flat_df_bokeh["date"])
    flat_df_bokeh["shortdate"] = pd.to_datetime(flat_df_bokeh["date"].apply(lambda x:x.strftime(fmt) if pd.notnull(x) else np.nan),
                                        format=fmt)

    flat_df_bokeh["date"] = list(zip(flat_df_bokeh["date"].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan),
                                     flat_df_num["wt"])) # turn into numeric timestamp
    
    df = flat_df_bokeh.groupby(['shortdate','party'])["date",var_name+"_wts",var_name+"_dk"].agg( lambda x: weighted_mean(x) ).reset_index()
    df["date"] = df["date"].apply(lambda x: datetime.fromtimestamp(x,tz=pytz.timezone('GMT')) if pd.notnull(x) else np.nan  ) # turn back from timestamp
    
    df["N"] = flat_df_bokeh.groupby(['shortdate','party']).count().reset_index()[var_name+"num"]
    df["wt_mean"] = flat_df_bokeh.groupby(['shortdate','party']).mean().reset_index()["wt"]*100/2 # "natural" position at 50%
    df["wt_std"] = flat_df_bokeh.groupby(['shortdate','party']).std().reset_index()["wt"]*100/3
    df["dk"] = df[var_name+"_dk"]*100 # % less confusing than fraction
    df["stdev"] = flat_df_bokeh.groupby(['shortdate','party'])[var_name+"num"].std().reset_index()[var_name+"num"]
    df = df[ df["N"]>=min_sample_size ]
    # plot in bokeh so we can explore!
    df.index = df["date"]
    df = df.rename(columns={var_name+"_wts":var_name+"num"}) # ,"shortdate":"date"
    df = df.pivot(index = "date",columns="party", values=[var_name+'num', 'dk', 'N', 'stdev','wt_mean','wt_std'])
    df.columns = ["_".join(x).strip("_") for x in df.columns]
    df = df.reset_index()
    party_list = flat_df_bokeh["party"].unique()
    
    return df,party_list



def bokeh_time(dtstr):
#     return pd.to_datetime(dtstr).value / 1e6
      return pd.to_datetime(dtstr).timestamp() * 1000

def bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict,var_name,x='date',min_sample_size=0,
                          plot_width=490,plot_height=300,dk=True,wts_charts=False,
                          tools='box_select,pan,xwheel_zoom,box_zoom,reset',active_drag='pan',active_scroll='xwheel_zoom',
                          prominence=.15,width=20,rolling_win_type = 'blackmanharris',window_size=250):


    y_axis_label_dk = "&tri=daily-mean-wt/squ=daily-std-wt"
    wave_gap_days_max =2
    lowest_alpha=0.3
    low_alpha=0.5    
    date_dict = {"22 May 2014":"EE","7 May 2015":"GE","23 June 2016":"EU",
                 "8 June 2017":"GE","23 May 2019":"EE","21 April 2016":"OB",
                 "9 October 2014":"CL","12 December 2019":"GE",
                 "3 March 2015":"CAMP", "23 April 2017":"CAMP", "6 November 2019":"CAMP","15 April 2016":"CAMP"}
    date_duration_dict = {"22 May 2014":1,"7 May 2015":1,"23 June 2016":1,
                 "8 June 2017":1,"23 May 2019":1,"21 April 2016":2,
                 "9 October 2014":1,"12 December 2019":1,
                 "3 March 2015":37, "23 April 2017": 45, "6 November 2019":35,"15 April 2016":68 }
    
    date_col_dict = {"EE":'green',"GE":'red',"EU":'blue',"OB":'pink',"CL":'purple',"CAMP":'orange'}        
    
    df,party_list =  get_bokeh_source(flat_df_num, min_sample_size=min_sample_size, var_name=var_name)
    party_list = list( color_dict.keys() )    
    for party in party_list:
        df["scaled_N_"+party] = df["N_"+party].apply(lambda x: np.log2(x+1))/2
    source = ColumnDataSource( df )    


    p = {}
    p_dk = {}
    tool_dict = {}
    outlier_dict = {}
#     rolling_win_type = 'BES'

    
    

    for party in party_list:

        title = title_dict[party]
        color = color_dict[party]
        y = var_name+'num_'+party    

        tool_dict[party]=\
            HoverTool(names=['daily_aggregates'],tooltips = [
                      ('Date','@date{%d/%m/%y}'),
            #                               ('Date','@wave'),
                      ('Mean','@'+var_name+'num_'+party),
                      ('N','@N_'+party),
                      ("SD",'@stdev_'+party),
                      ("DK%",'@dk_'+party),
                     ],
                formatters={'date': 'datetime'},
                # display a tooltip whenever the cursor is vertically in line with a glyph
#                 mode='vline'
                     )


        p[party]= figure(x_axis_type="datetime", x_axis_label = x_axis_label,
                    y_axis_label = y_axis_label,
                    plot_width = plot_width, plot_height=plot_height,
                    title = title, tools = tools, active_drag = active_drag, active_scroll=active_scroll)
        p[party].title.text_font_size = '8pt'

        df_rolling = rolling_av( party, var_name+"num" ,win_type=rolling_win_type,window_size=window_size )
        
        
        # remove the lines connnecting differen waves!
        inferred_wave_gaps = (df_rolling.reset_index()["date"].diff()>timedelta(days=wave_gap_days_max)).astype('int').cumsum()

        if not np.isnan(inferred_wave_gaps.max()):
            for inf_wave in range(0,inferred_wave_gaps.max()+1):   
                rolling_source = ColumnDataSource(pd.DataFrame(df_rolling[df_rolling.index[inferred_wave_gaps==inf_wave]]))
                p[party].line(x,var_name+"num", source=rolling_source, color=color)


        x_out,y_out,outliers = detect_peaks(df_rolling,prominence,width)
        p[party].x(x_out,y_out,color = 'green',size=20, alpha=0.5)

        p[party].circle(x,y, source=source, color=color,name='daily_aggregates', size='scaled_N_'+party,
                        fill_alpha=0.0,line_width=1)
        p[party].add_tools(tool_dict[party])
        outliers.name = party
        outlier_dict[party] = outliers

         
                             

        if dk:
            df_rolling = rolling_av( party, var_name+"dk",win_type=rolling_win_type,window_size=window_size )*100

            p_dk[party]= figure(x_axis_type="datetime",x_axis_label = x_axis_label,
                            y_axis_label=y_axis_label_dk,
                            plot_width = plot_width, plot_height=plot_height,
                            title="".join(title.split(" (")[:-1]+[" (% Don't know)"]),
                            tools=tools, active_drag=active_drag, active_scroll=active_scroll)
            p_dk[party].title.text_font_size = '8pt'

            p_dk[party].circle('date', 'dk_'+party, source=source, color=color, name='daily_aggregates',
                                size='scaled_N_'+party, fill_alpha=0.0,line_width=1)
            if wts_charts:
                p_dk[party].triangle('date', 'wt_mean_'+party, source=source, color=color, 
                                    size='scaled_N_'+party, fill_alpha=0.0,line_width=1)    
                p_dk[party].square('date', 'wt_std_'+party, source=source, color=color, 
                                    size='scaled_N_'+party, fill_alpha=0.0,line_width=1) 
            
            p_dk[party].add_tools( tool_dict[party] )

            inferred_wave_gaps = (df_rolling.reset_index()["date"].diff()>timedelta(days=wave_gap_days_max)).astype('int').cumsum()
            if not np.isnan(inferred_wave_gaps.max()):
                for inf_wave in range(0,inferred_wave_gaps.max()+1):   
                    rolling_source = ColumnDataSource(pd.DataFrame(df_rolling[df_rolling.index[inferred_wave_gaps==inf_wave]]))
                    p_dk[party].line(x,var_name+"dk", source=rolling_source, color=color)

            x_out,y_out,outliers = detect_peaks(df_rolling,prominence*100,width)
            p_dk[party].x(x_out,y_out,color = 'green',size=20, alpha=0.5)
   
        for date in date_dict.keys():       
            
            if flat_df_num["wave_startpoint"].min()<pd.to_datetime(date) and flat_df_num["wave_endpoint"].max()>pd.to_datetime(date):
                
                my_label = Label(x=pd.to_datetime(date), y=200 + (date_duration_dict[date]>2)*40,
                                 y_units='screen', text=date_dict[date])
                p[party].add_layout(my_label) 
            
                box_left = pd.to_datetime(date)
                box_right = pd.to_datetime(date)+timedelta(days=date_duration_dict[date])

                box = BoxAnnotation(left=box_left, right=box_right,
                                    line_width=1, line_color='black', line_dash='dashed',
                                    fill_alpha=0.2, fill_color=date_col_dict[date_dict[date]])

                p[party].add_layout(box)   
                if dk:
                    p_dk[party].add_layout(my_label) 
                    p_dk[party].add_layout(box) 

    for pno in range(0,len(party_list)):
        for pno2 in range(0,len(party_list)):
            if pno==pno2:
                continue
            p[party_list[pno]].x_range = p[party_list[pno2]].x_range
            p[party_list[pno2]].x_range = p[party_list[pno]].x_range

            if dk:
                p[party_list[pno2]].x_range = p_dk[party_list[pno2]].x_range
                p_dk[party_list[pno]].x_range = p_dk[party_list[pno2]].x_range

    if len(party_list)>1:
        p[party_list[1]].x_range = p[party_list[0]].x_range                
                
    if dk:
        p[party_list[0]].x_range = p_dk[party_list[0]].x_range
        lay = layout( [[y for y in x] for x in list(zip(p.values(),p_dk.values()))] )
    else:
        lay = layout( list(p.values()) )



    show(lay)

    for party in party_list:
        print(party)
        display(outlier_dict[party])
        
    return df,df_rolling

In [57]:

wave_to_date[17] = "Nov-19"
wave_to_date[18] = "Nov-19"
wave_to_date[19] = "Dec-19"

In [147]:

use_BES_weights = True
rolling_win_type = "BES"

In [59]:

# use_BES_weights = False
# rolling_win_type = 'blackmanharris'

In [ ]:

In [60]:

# search(BES_Panel , "past")

In [61]:

# Con2019 = BES_Panel["p_past_vote_2019"]=="Conservative"
# ConBefore2019 = (BES_Panel[["p_past_vote_2015","p_past_vote_2017",]]=="Conservative").any(axis=1)
# mask = (Con2019&(~ConBefore2019))
# mask.sum()

Out[61]:

In [148]:

Con2019 = BES_Panel["p_past_vote_2019"]=="Conservative"
ConBefore2019 = (BES_Panel[["p_past_vote_2005","p_past_vote_2010","p_past_vote_2015","p_past_vote_2017",]]=="Conservative").any(axis=1)
mask = (Con2019&(~ConBefore2019))
mask.sum()

Out[148]:

How the entire electorate's feelings about the Conservative Party changed over time¶

In [63]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|LD|SNP|PC|UKIP|Grn|TIG|BNP|BrexitParty)"
specific_suffix_set = "(Con)"
# 
col_wrap = 2
height   = 6

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
#                                mask = mask
                              )
                               #,use_BES_weights=True)

Wall time: 3min 1s

Now just the people who voted Conservative in 2019 but not before¶

(includes people who didn't vote/were too young to vote before)¶

In [149]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|LD|SNP|PC|UKIP|Grn|TIG|BNP|BrexitParty)"
specific_suffix_set = "(Con)"
# 
col_wrap = 1
height   = 10

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,
                              )
                               #,use_BES_weights=True)

Wall time: 26.1 s

How those same people feel about all the parties¶

In [150]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|LD|SNP|PC|UKIP|Grn|TIG|BNP|BrexitParty)"
# specific_suffix_set = "(Con)"
# 
col_wrap = 3
height   = 4

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,aspect=1.15,
                              )
                               #,use_BES_weights=True)

Wall time: 2min 56s

If you're wondering if those people are pretty Brexity - they are¶

In [151]:

BES_Panel["p_eurefvote"][mask].value_counts()

Out[151]:

Leave the EU             1438
Stay/remain in the EU     239
Don't know                  4
Name: p_eurefvote, dtype: int64

But what if the shift happens within the BES waves?¶

Here's a visualisation that lets you navigate the timeline and look at the patterns within waves¶

(mousewheel lets you zoom into/out of the horizontal scale)¶

In [152]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,colour_list))
title_dict = dict(zip(suff_list,["How much do you like (1) or dislike (0) each of the following parties? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20

df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Con

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Lab

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

LD

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

SNP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2016-06-08 12:00:00	0.187318	38	1946	171.440036	0.33495	2015-05-18 04:00:00	2016-11-26 20:00:00

PC

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

UKIP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2016-12-09 10:00:00	0.162743	22	3233	98.646447	0.519319	2016-12-03 16:00:00	2017-04-24 16:00:00

Grn

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

TIG

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

BNP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

BrexitParty

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2019-06-08 18:00:00	0.228943	61	671	314.194225	0.58046	2019-05-25 09:00:00	2019-11-02 11:00:00

Wall time: 58.6 s

In [ ]:

In [153]:

# %%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following politicians?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(AlanJohnson|Bartley|Batten|Bennett|Berger|Berry|Blair|Johnson|Cable|Cameron|Clegg|Corbyn|Davey|Davidson|Davis|Farage|Farron|Gove|Harvie|Hunt|Jarvis|Javid|Leonard|LongBailey|Lucas|May|McDonnell|Miliband|Mogg|Moran|Nuttall|Osborne|Phillips|Price|Rayner|Rennie|Rudd|SEvans|Salmond|Soubry|Starmer|Sturgeon|Swinson|Umunna|Watson|Wood)"
# 
col_wrap = 3
height   = 4

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,min_waves_included=1,aspect=1.15)

In [154]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following politicians?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Cameron|May|Johnson|Miliband|Corbyn|Starmer|Farage|Nuttall|Batten|Clegg|Farron|Cable|Swinson|Mogg|Davis)"

# 
col_wrap = 3
height   = 4

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,min_waves_included=1,aspect=1.15,
                               col_order=specific_suffix_set.replace(")","").replace("(","").split("|") )

Wall time: 2min 34s

In [ ]:

In [155]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,cycle(colour_list)))
title_dict = dict(zip(suff_list,["How much do you like (1) or dislike (0) each of the following politicians? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20
                                 
df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Cameron

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

May

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-04-27 21:00:00	0.232222	0	956	648.520808	0.47566	2016-11-25 19:00:00	2017-05-26 08:00:00

Johnson

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Miliband

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Corbyn

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-06-13 21:00:00	0.223164	792	3411	269.595143	0.351213	2017-05-28 16:00:00	2017-06-19 03:00:00

Starmer

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Farage

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2019-06-01 11:00:00	0.190262	77	3234	311.648179	0.531275	2019-05-25 09:00:00	2019-11-01 18:00:00

Nuttall

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Batten

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Clegg

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Farron

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Cable

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Swinson

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Mogg

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Davis

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Wall time: 49.4 s

In [ ]:

In [156]:

%%time
var_name = "ptv"
title= "\n".join(["How likely is it that you would ever vote for each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|LD|SNP|PC|UKIP|Grn|TIG|BNP|BrexitParty)"
# specific_suffix_set = "(Con)"
# 
col_wrap = 3
height   = 4

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="ptv_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,aspect=1.15,
                              )
                               #,use_BES_weights=True)

Wall time: 1min 42s

In [175]:

%%time
var_name = "ptv"
title= "\n".join(["How likely is it that you would ever vote for each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|UKIP|BrexitParty)"
# specific_suffix_set = "(Con)"
# 
col_wrap = 2
height   = 6

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="ptv_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,aspect=1,
                              )
                               #,use_BES_weights=True)

Wall time: 30.3 s

In [158]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,colour_list))
title_dict = dict(zip(suff_list,["How likely is it that you would ever vote for each of the following parties? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20
                                 
df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Con

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-04-29 06:00:00	0.164285	43	2079	367.941196	0.349820	2016-12-07 18:00:00	2017-06-11 11:00:00
2019-03-24 18:00:00	0.172839	43	2422	83.635815	0.357156	2019-03-23 08:00:00	2019-05-24 18:00:00

Lab

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2015-03-26 09:00:00	0.153435	157	2380	453.355989	0.558378	2014-09-25 00:00:00	2015-05-05 13:00:00
2017-06-15 16:00:00	0.225008	1486	2380	168.897410	0.452872	2017-06-07 09:00:00	2017-06-22 08:00:00

UKIP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2016-12-09 03:00:00	0.173288	7	2446	100.546041	0.49352	2016-12-03 11:00:00	2017-04-24 16:00:00

BrexitParty

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2019-06-05 23:00:00	0.25383	74	601	219.73232	0.551031	2019-05-26 17:00:00	2019-11-02 11:00:00

Wall time: 22.8 s

In [ ]:

Repeat the previous analysis but with a tight definition of "First Time Conservative"¶

Only people for which we definitely have their 05/10/15/17 votes¶

And only those over 32 in 2019 who could have voted 05-17¶

Will also drop chart panels where nothing much interesting is happening¶

In [177]:

Con2019 = BES_Panel["p_past_vote_2019"]=="Conservative"

past_elections = ["p_past_vote_2005","p_past_vote_2010","p_past_vote_2015","p_past_vote_2017",]
# past_elections = ["p_past_vote_2015","p_past_vote_2017",]

ConBefore2019 = (BES_Panel[past_elections]=="Conservative").any(axis=1)
mask = Con2019&(~ConBefore2019)&BES_Panel[past_elections].notnull().all(axis=1)&(BES_Panel["ageW19"]>32)
mask.sum()

Out[177]:

In [178]:

# %%time
# var_name = "like"
# title= "\n".join(["How much do you like or dislike each of the following parties?",
#                   "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
# specific_suffix_set = "(Con|Lab|LD|UKIP|Grn|TIG|BNP|BrexitParty)"
# # specific_suffix_set = "(Con)"
# # 
# col_wrap = 3
# height   = 4

# flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
#                                col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
#                                mask = mask,n_boot=1000,aspect=1.15,
#                               )
#                                #,use_BES_weights=True)

In [174]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|UKIP|BrexitParty)"
# specific_suffix_set = "(Con)"
# 
col_wrap = 2
height   = 6

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,aspect=1,
                              )
                               #,use_BES_weights=True)

Wall time: 44 s

In [162]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,colour_list))
title_dict = dict(zip(suff_list,["How much do you like (1) or dislike (0) each of the following parties? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20
                                 
df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Con

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-04-29 17:00:00	0.175515	602	2498	601.566841	0.340948	2016-11-29 17:00:00	2017-06-11 17:00:00

Lab

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

UKIP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2016-12-06 20:00:00	0.181613	1139	2535	125.318765	0.487641	2016-12-02 11:00:00	2017-04-26 14:00:00

BrexitParty

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2019-06-06 08:00:00	0.213349	15	929	200.536001	0.607138	2019-05-26 13:00:00	2019-11-02 08:00:00

Wall time: 22.6 s

In [ ]:

In [179]:

# %%time
# var_name = "ptv"
# title= "\n".join(["How likely is it that you would ever vote for each of the following parties?",
#                   "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
# specific_suffix_set = "(Con|Lab|LD|UKIP|Grn|TIG|BNP|BrexitParty)"
# # specific_suffix_set = "(Con)"
# # 
# col_wrap = 3
# height   = 4

# flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
#                                col_wrap=col_wrap,treatment="ptv_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
#                                mask = mask,n_boot=1000,aspect=1.15,
#                               )
#                                #,use_BES_weights=True)

In [173]:

%%time
var_name = "ptv"
title= "\n".join(["How likely is it that you would ever vote for each of the following parties?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Con|Lab|UKIP|BrexitParty)"
# specific_suffix_set = "(Con)"
# 
col_wrap = 2
height   = 6

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="ptv_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,aspect=1.0,
                              )

Wall time: 30.6 s

In [165]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,colour_list))
title_dict = dict(zip(suff_list,["How likely is it that you would ever vote for each of the following parties? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20
                                 
df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Con

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-05-08 14:00:00	0.191952	160	1864	298.622834	0.310604	2016-12-07 10:00:00	2017-06-11 21:00:00

Lab

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-06-13 19:00:00	0.256408	1168	1883	200.950379	0.488014	2017-05-21 13:00:00	2017-06-23 10:00:00

UKIP

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2016-12-07 08:00:00	0.190825	521	1866	111.181031	0.484992	2016-12-02 15:00:00	2017-04-25 18:00:00
2019-03-20 10:00:00	0.177635	1480	1866	125.419047	0.388092	2019-03-14 09:00:00	2019-03-24 04:00:00

BrexitParty

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2019-06-02 09:00:00	0.161121	16	463	119.629049	0.614889	2019-05-28 19:00:00	2019-11-02 08:00:00

Wall time: 16.5 s

In [ ]:

In [180]:

# %%time
# var_name = "like"
# title= "\n".join(["How much do you like or dislike each of the following politicians?",
#                   "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
# specific_suffix_set = "(AlanJohnson|Bartley|Batten|Bennett|Berger|Berry|Blair|Johnson|Cable|Cameron|Clegg|Corbyn|Davey|Davidson|Davis|Farage|Farron|Gove|Harvie|Hunt|Jarvis|Javid|Leonard|LongBailey|Lucas|May|McDonnell|Miliband|Mogg|Moran|Nuttall|Osborne|Phillips|Price|Rayner|Rennie|Rudd|SEvans|Salmond|Soubry|Starmer|Sturgeon|Swinson|Umunna|Watson|Wood)"
# # 
# col_wrap = 3
# height   = 4

# flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
#                                col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
#                                mask = mask,n_boot=1000,min_waves_included=1,aspect=1.15,)

In [167]:

%%time
var_name = "like"
title= "\n".join(["How much do you like or dislike each of the following politicians?",
                  "Strongly dislike (0) - Strongly like (1) (11 step scale)"])
specific_suffix_set = "(Cameron|May|Johnson|Miliband|Corbyn|Starmer|Farage|Nuttall|Batten|Clegg|Farron|Cable|Swinson|Mogg|Davis)"

# 
col_wrap = 3
height   = 4

flat_df_num = plot_time_series(var_name=var_name,specific_suffix_set=specific_suffix_set,title=title,col_name="party",
                               col_wrap=col_wrap,treatment="immig_timeseries",max_wave=max_wave,use_BES_weights=use_BES_weights,
                               mask = mask,n_boot=1000,min_waves_included=1,aspect=1.15,
                               col_order=specific_suffix_set.replace(")","").replace("(","").split("|") )

Wall time: 1min 19s

In [172]:

%%time

y_axis_label = "circles=daily-mean/line=moving average"
x_axis_label = "Response Date"


suff_list = specific_suffix_set.replace(")","").replace("(","").split("|")
color_dict = dict(zip(suff_list,cycle(colour_list)))
title_dict = dict(zip(suff_list,["How much do you like (1) or dislike (0) each of the following politicians? "+"("+x+")?" for x in suff_list]))

prominence=.15
width=20
                                 
df,df_rolling = bokeh_explorer_charts(x_axis_label,y_axis_label,color_dict,title_dict, var_name,
                    prominence=prominence,width=width, rolling_win_type=rolling_win_type)

Cameron

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

May

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-04-27 09:00:00	0.230738	0	941	498.04603	0.480187	2016-11-26 21:00:00	2017-05-25 12:00:00

Johnson

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Miliband

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Corbyn

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date
2017-06-13 18:00:00	0.25448	541	2655	275.503552	0.371878	2017-05-25 12:00:00	2017-06-22 19:00:00

Starmer

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Farage

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Nuttall

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Batten

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Clegg

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Farron

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Cable

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Swinson

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Mogg

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Davis

	prominences	left_bases	right_bases	widths	width_heights	start	stop
date

Wall time: 36.7 s

In [ ]:

First Time Conservatives Attitudes to Parties&Party Leaders (For Heinz Brandenburg)

How the entire electorate's feelings about the Conservative Party changed over time¶

Now just the people who voted Conservative in 2019 but not before¶

(includes people who didn't vote/were too young to vote before)¶

How those same people feel about all the parties¶

If you're wondering if those people are pretty Brexity - they are¶

But what if the shift happens within the BES waves?¶

Here's a visualisation that lets you navigate the timeline and look at the patterns within waves¶

(mousewheel lets you zoom into/out of the horizontal scale)¶

Repeat the previous analysis but with a tight definition of "First Time Conservative"¶

Only people for which we definitely have their 05/10/15/17 votes¶

And only those over 32 in 2019 who could have voted 05-17¶

Will also drop chart panels where nothing much interesting is happening¶

Comments