import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
pd.options.display.max_columns=200
import ipywidgets as widgets
df= pd.read_csv(r'data\aa_auditopin.csv')
df.head(3)
audit_op_key | afk | afn | gco | af_ctry | fy | form_fkey | cik | tic | big5 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 66900 | 5 | Arthur Andersen LLP | 0 | USA | 2000 | 10-K | 20 | NaN | 1 |
1 | 78252 | 5 | Arthur Andersen LLP | 0 | USA | 2001 | 10-K405 | 20 | NaN | 1 |
2 | 3538 | 4 | KPMG LLP | 0 | USA | 2002 | 10-K | 20 | NaN | 1 |
df_b6= df[df.afk<=6]
df_b6=df_b6[['afn', 'fy']]
df_b6.head()
afn | fy | |
---|---|---|
0 | Arthur Andersen LLP | 2000 |
1 | Arthur Andersen LLP | 2001 |
2 | KPMG LLP | 2002 |
3 | Grant Thornton LLP | 2003 |
4 | Grant Thornton LLP | 2004 |
df_cross= pd.crosstab(df_b6.fy, df_b6.afn).reset_index()
df_cross.head()
afn | fy | Arthur Andersen LLP | Deloitte & Touche LLP | Ernst & Young LLP | Grant Thornton LLP | KPMG LLP | PricewaterhouseCoopers LLP |
---|---|---|---|---|---|---|---|
0 | 2000 | 2549 | 2884 | 3306 | 981 | 1998 | 3098 |
1 | 2001 | 2357 | 2724 | 2662 | 987 | 2154 | 2973 |
2 | 2002 | 196 | 3177 | 2863 | 955 | 2284 | 2875 |
3 | 2003 | 0 | 3232 | 3079 | 926 | 2295 | 3300 |
4 | 2004 | 0 | 2698 | 2785 | 1164 | 2175 | 2714 |
df_cross_melted= pd.melt(df_cross, id_vars='fy')
df_cross_melted.rename(columns={'value':'count'}, inplace= True)
df_cross_melted.head()
fy | afn | count | |
---|---|---|---|
0 | 2000 | Arthur Andersen LLP | 2549 |
1 | 2001 | Arthur Andersen LLP | 2357 |
2 | 2002 | Arthur Andersen LLP | 196 |
3 | 2003 | Arthur Andersen LLP | 0 |
4 | 2004 | Arthur Andersen LLP | 0 |
df_cross_melted['bubble_size']= df_cross_melted['count']
df_cross_melted.loc[df_cross_melted.bubble_size<=300, 'bubble_size']=300
df_cross_melted.head()
fy | afn | count | bubble_size | |
---|---|---|---|---|
0 | 2000 | Arthur Andersen LLP | 2549 | 2549 |
1 | 2001 | Arthur Andersen LLP | 2357 | 2357 |
2 | 2002 | Arthur Andersen LLP | 196 | 300 |
3 | 2003 | Arthur Andersen LLP | 0 | 300 |
4 | 2004 | Arthur Andersen LLP | 0 | 300 |
df_cross_melted.afn= df_cross_melted.afn.str.replace(' LLP', '')
df_cross_melted.loc[df_cross_melted.afn == 'PricewaterhouseCoopers', 'afn']='PwC'
df_cross_melted.loc[df_cross_melted.afn == 'Ernst & Young', 'afn']='EY'
df_cross_melted.loc[df_cross_melted.afn == 'Deloitte & Touche', 'afn']='Deloitte'
fig=px.scatter(df_cross_melted, x='fy', y='count',
hover_name='afn',
color='afn',
size='bubble_size',
size_max=30,
range_x= [2000,2020],
range_y=[-100,3500],
height=600,
width= 1000,
labels=dict(fy='Fiscal Year', count='# of Clients', afn='')
)
# global set-up
fig.update_layout(
font_family="Times New Roman",
font_size=15)
# title set-up
fig.update_layout(
title={
'text': "<b>The Number of Clients for Big 6 Audit Firms</b>",
'font_size':25,
'x':0.45,
'xanchor': 'center',
'yanchor': 'top'})
# xticks set-up
fig.update_xaxes(
dtick=1)
fig.show()
fig=px.scatter(df_cross_melted, x='fy', y='count',
hover_name='afn',
animation_frame='fy',
animation_group='afn',
color='afn',
size='bubble_size',
size_max=30,
range_x= [2000,2020],
range_y=[-100,4000],
height=600,
width= 800,
labels=dict(fy='Fiscal Year', count='# of Clients', afn='')
)
# global set-up
fig.update_layout(
font_family="Times New Roman",
font_size=15)
# title set-up
fig.update_layout(
title={
'text': "<b>The Number of clients for big 6 audit firms</b>",
'font_size':25,
'x':0.45,
'xanchor': 'center',
'yanchor': 'top'},
legend=dict(yanchor='top',
y=0.99,
xanchor='right',
x=0.99,
font_size=13,
))
# xticks set-up
fig.update_xaxes(
dtick=5)
fig.show()
fig.write_html('..\_includes\client-dist-figure.html', auto_play= False)
! jupyter nbconvert audit-analytics-client-distribution.ipynb --to html
[NbConvertApp] Converting notebook audit-analytics-client-distribution.ipynb to html [NbConvertApp] Writing 4343741 bytes to audit-analytics-client-distribution.html