import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.express as px
import ipywidgets as widgets
import wrds
from jupyter_dash import JupyterDash
from dash import html, dcc
from dash.dependencies import Input, Output
pd.options.display.max_colwidth = 200
pd.set_option('display.max_columns', None)
conn=wrds.Connection()
WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully. Loading library list... Done
df= conn.raw_sql(f"""select conm, gvkey, tic, fyear, at, lt, ni, oancf, sale, sich, prcc_f * csho as mv
from comp.funda a
where a.fyear >= 1990 and
a.fyear <= 2020 and
a.indfmt= 'INDL' and
a.datafmt = 'STD' and
a.popsrc= 'D' and
a.consol = 'C'
""")
df=df.dropna(subset='sich')
df= df.sort_values(['gvkey','fyear'])
df['fyear']= df['fyear'].astype(int)
df['lev']= df['lt'] / df['at']
df= df.reset_index(drop=True)
print(df.shape)
df.head()
(254976, 12)
conm | gvkey | tic | fyear | at | lt | ni | oancf | sale | sich | mv | lev | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAR CORP | 001004 | AIR | 1990 | 379.958 | 186.180 | 14.801 | 36.891 | 466.542 | 5080.0 | 224.460360 | 0.490002 |
1 | AAR CORP | 001004 | AIR | 1991 | 395.351 | 198.614 | 10.020 | 8.814 | 422.657 | 5080.0 | 204.699595 | 0.502374 |
2 | AAR CORP | 001004 | AIR | 1992 | 365.151 | 175.935 | 0.283 | 16.806 | 382.780 | 5080.0 | 214.663477 | 0.481814 |
3 | AAR CORP | 001004 | AIR | 1993 | 417.626 | 228.138 | 9.494 | 6.697 | 407.754 | 5080.0 | 228.648720 | 0.546273 |
4 | AAR CORP | 001004 | AIR | 1994 | 425.814 | 228.695 | 10.463 | 15.255 | 451.395 | 5080.0 | 243.405235 | 0.537077 |
df['sic2']= (df['sich']/100).astype(int)
df['lag_at']= df.groupby('gvkey')['at'].shift(1)
df['ROA']= df['ni'] / df['lag_at']
df['CFO']= df['oancf'] / df['lag_at']
df['accruals']= (df['ni']-df['oancf']) / df['lag_at']
print(df.shape)
df.head(3)
(254976, 17)
conm | gvkey | tic | fyear | at | lt | ni | oancf | sale | sich | mv | lev | sic2 | lag_at | ROA | CFO | accruals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAR CORP | 001004 | AIR | 1990 | 379.958 | 186.180 | 14.801 | 36.891 | 466.542 | 5080.0 | 224.460360 | 0.490002 | 50 | NaN | NaN | NaN | NaN |
1 | AAR CORP | 001004 | AIR | 1991 | 395.351 | 198.614 | 10.020 | 8.814 | 422.657 | 5080.0 | 204.699595 | 0.502374 | 50 | 379.958 | 0.026371 | 0.023197 | 0.003174 |
2 | AAR CORP | 001004 | AIR | 1992 | 365.151 | 175.935 | 0.283 | 16.806 | 382.780 | 5080.0 | 214.663477 | 0.481814 | 50 | 395.351 | 0.000716 | 0.042509 | -0.041793 |
df.dropna(inplace=True)
df= df[df['ROA'].abs()<0.5]
df= df[df['sale']>0]
df= df[df['at']>0]
df= df[df['mv']>0]
df['log_at']=np.log10(df['at'])
df['log_sale']=np.log10(df['sale'])
df['log_mv']=np.log10(df['mv'])
df.shape
(172303, 20)
df.loc[df.sic2 <= 9, 'industry']= 'Agriculture, Forest, Fishing'
df.loc[(df.sic2 >= 10) & (df.sic2 <= 14), 'industry']= 'Mining'
df.loc[(df.sic2 >= 15) & (df.sic2 <= 17), 'industry']= 'Construction'
df.loc[(df.sic2 >= 20) & (df.sic2 <= 39), 'industry']= 'Manufacturing'
df.loc[(df.sic2 >= 40) & (df.sic2 <= 49), 'industry']= 'Trans. & Public Utilities'
df.loc[(df.sic2 >= 50) & (df.sic2 <= 51), 'industry']= 'Wholesale Trade'
df.loc[(df.sic2 >= 52) & (df.sic2 <= 59), 'industry']= 'Retail Trade'
df.loc[(df.sic2 >= 60) & (df.sic2 <= 67), 'industry']= 'Finance, Insurance, Real Estate'
df.loc[(df.sic2 >= 70) & (df.sic2 <= 89), 'industry']= 'Services'
df.loc[(df.sic2 >= 91) & (df.sic2 <= 99), 'industry']= 'Public Administration'
df=df[(df.sic2<60) | (df.sic2>=70)]
print(df.shape)
(140403, 21)
df=df[df.fyear>=1995]
df_N= df.groupby('gvkey').agg({'gvkey':'count', 'industry':'nunique'}).rename(columns={'gvkey':'N', 'industry':'ind_count'}).reset_index()
df_N.head()
gvkey | N | ind_count | |
---|---|---|---|
0 | 001004 | 26 | 1 |
1 | 001013 | 15 | 1 |
2 | 001019 | 21 | 1 |
3 | 001021 | 14 | 1 |
4 | 001034 | 13 | 1 |
df_to_filter= df_N[(df_N.ind_count==1) & (df_N.N==df_N.N.max())]
balance_gvkey_list= list(df_to_filter.gvkey)
len(balance_gvkey_list)
795
df= df[df.gvkey.isin(balance_gvkey_list)]
df.shape
(20670, 21)
df.fyear.value_counts().sort_index().plot(kind='bar')
print(f"Unique firms: {df.gvkey.nunique()}")
plt.show()
Unique firms: 795
Balanced panel is confirmed.
df[df.fyear==2020].value_counts('industry')
industry Manufacturing 456 Trans. & Public Utilities 117 Services 80 Retail Trade 56 Mining 36 Wholesale Trade 33 Construction 16 Agriculture, Forest, Fishing 1 dtype: int64
df=df[df['industry'].isin(['Services', 'Retail Trade', 'Manufacturing'])]
df_2020= df[df.fyear==2020]
gvkey_100_list= list(df_2020.sort_values('mv', ascending= False).head(100)['gvkey'])
df=df[df.gvkey.isin(gvkey_100_list)]
df.shape
(2600, 21)
df[['at','lt','mv','sale','ni']]= df[['at','lt','mv','sale','ni']]*1000*1000
df[['ROA', 'CFO', 'accruals']].describe()
ROA | CFO | accruals | |
---|---|---|---|
count | 2600.000000 | 2600.000000 | 2600.000000 |
mean | 0.093176 | 0.143236 | -0.050060 |
std | 0.075244 | 0.078937 | 0.055847 |
min | -0.478937 | -0.149743 | -0.565054 |
25% | 0.052462 | 0.092544 | -0.072813 |
50% | 0.085562 | 0.131595 | -0.046237 |
75% | 0.131053 | 0.181850 | -0.024897 |
max | 0.444756 | 0.553114 | 0.321131 |
xcol='sale'
ycol='ROA'
xcol_label= 'SALES'
ycol_label= 'ROA'
bubble_size_base= 'mv'
df['bubble_size']= df[bubble_size_base].clip(df[bubble_size_base].quantile(0.1),)
bubble_size_label= 'market value'
fig=px.scatter(df, x=xcol, y=ycol,
animation_frame='fyear',
animation_group='conm',
height= 600, width= 800,
range_x=[min(df[xcol]), max(df[xcol])],
range_y=[min(df[ycol]), max(df[ycol])],
size='bubble_size',
size_max= 50,
color='industry',
log_x= True,
log_y= False,
hover_name= 'conm',
labels={xcol:xcol_label, ycol:ycol_label, 'industry':''}
)
# global set-up
fig.update_layout(
font_family="Times New Roman",
font_size=15)
# title set-up
fig.update_layout(
title={
'text': f"<b>Dynamic scatter plot: {xcol_label} & {ycol_label} with {bubble_size_label} as a bubble size</b>",
'font_size':25,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
legend=dict(yanchor='top',
y=0.99,
xanchor='left',
x=0.01,
font_size=13,
))
fig.show()
fig.write_html('../_includes/dynamic-scatter-plot.html', auto_play= False)
app = JupyterDash(__name__)
app.layout= html.Div([
html.H1('Dynamic Scatter Plots with market value as a bubble size'),
html.Div([dcc.Dropdown(placeholder="x-variable:", id= "xvar-dropdown", value= 'log_at', options= ['at', 'sale','log_at', 'log_sale']),
dcc.Dropdown(id= "yvar-dropdown", value= 'log_mv', options= ['mv', 'log_mv'])],
style= {'width':'30%'}),
dcc.Graph(id='graph')
])
@app.callback(
Output("graph", "figure"),
[Input("xvar-dropdown", "value"), Input("yvar-dropdown", "value")]
)
def scatter_plot(xvar, yvar):
xcol= xvar
ycol= yvar
# Define size variable to be used as a bubble size
bubble_size_base= 'mv'
fig=px.scatter(df, x=xcol, y=ycol,
animation_frame='fyear',
animation_group='conm',
height= 600, width= 800,
range_x=[min(df[xcol]), max(df[xcol])],
range_y=[min(df[ycol]), max(df[ycol])],
size='bubble_size',
size_max= 50,
color='industry',
hover_name= 'conm',
labels={'industry':''}
)
# global set-up
fig.update_layout(
font_family="Times New Roman",
font_size=15)
# title set-up
fig.update_layout(
title={
'text': f"<b>Dynamic Scatter Plots: {xcol} & {ycol} with {bubble_size_base} as a bubble size</b>",
'font_size':25,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
legend=dict(yanchor='top',
y=0.99,
xanchor='left',
x=0.01,
font_size=13,
))
return fig
app.run_server(mode='inline')
! jupyter nbconvert compustat-bubble-plot-animation.ipynb --to html
[NbConvertApp] Converting notebook compustat-bubble-plot-animation.ipynb to html [NbConvertApp] Writing 642893 bytes to compustat-bubble-plot-animation.html