import json
import requests
import pandas as pd
import numpy as np
import plotly.express as px
import locale
from decimal import Decimal


def _get_br_gov_data(url, filters, page, timeout=30):
    """Fetch a single page of results from a paginated API endpoint.

    Args:
        url: Endpoint URL without a query string.
        filters: Query string that is appended verbatim to ``url``; it is
            expected to start with "?" and to be URL-encoded already.
        page: Page number, sent as the ``pagina`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``requests.Response`` object of the GET request.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    complete_url = f"{url}{filters}&pagina={page}"
    # NOTE(review): the API key and session cookie are hard-coded; consider
    # loading them from the environment instead of committing them.
    headers = {
      'chave-api-dados': '7048f20892f6e59dc78cf7707e8ad843',
      'Cookie': 'JSESSIONID=358C2ECB5CB444691D06750877E67077'
    }

    response = requests.get(complete_url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors so an error payload is never mistaken for a
    # page of records by the caller.
    response.raise_for_status()
    return response


url = "https://api.portaldatransparencia.gov.br/api-de-dados/cartoes"
filters = "?dataTransacaoFim=30%2F07%2F2023&dataTransacaoInicio=01%2F01%2F2023"

# Walk the pages in order until the API returns an empty page. Each page is
# fetched once and appended exactly once, so no page is stored twice.
all_data = []
page = 1

while True:
    data_by_page = _get_br_gov_data(url, filters, page).json()
    if len(data_by_page) == 0:
        # ``page`` is the first empty page, so ``page - 1`` pages held data.
        print("total pages" + str(page-1))
        break
    all_data.extend(data_by_page)
    page += 1

# Flatten the nested JSON records into one column per leaf field.
df_2023 = pd.json_normalize(all_data)
df_2023

total pages375


url = "https://api.portaldatransparencia.gov.br/api-de-dados/cartoes"
filters = "?dataTransacaoFim=30%2F07%2F2022&dataTransacaoInicio=01%2F01%2F2022"

# Walk the pages in order until the API returns an empty page. Each page is
# fetched once and appended exactly once, so no page is stored twice.
all_data = []
page = 1

while True:
    data_by_page = _get_br_gov_data(url, filters, page).json()
    if len(data_by_page) == 0:
        # ``page`` is the first empty page, so ``page - 1`` pages held data.
        print("total pages" + str(page-1))
        break
    all_data.extend(data_by_page)
    page += 1

# Flatten the nested JSON records into one column per leaf field.
df_2022 = pd.json_normalize(all_data)
df_2022


# Tag each yearly frame with its year so the rows stay distinguishable after
# the two frames are stacked.
df_2022 = df_2022.assign(Year=2022)
df_2023 = df_2023.assign(Year=2023)
df = pd.concat([df_2022,df_2023])
df


# Drop rows that are identical in every column.
df = df.drop_duplicates()


# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52293 entries, 0 to 11209
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   id                               52293 non-null  int64 
 1   mesExtrato                       52293 non-null  object
 2   valorTransacao                   52293 non-null  object
 3   tipoCartao.descricao             52293 non-null  object
 4   unidadeGestora.orgaoMaximo.nome  52293 non-null  object
 5   portador.nome                    52293 non-null  object
 6   Year                             52293 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 3.2+ MB
None


# Keep only the rows whose amount string contains no "-" character.
is_negative = df['valorTransacao'].str.contains('-')
df = df[is_negative.eq(False)]


# Narrow the frame to the columns used by the rest of the analysis.
kept_columns = [
    'id',
    'mesExtrato',
    'valorTransacao',
    'tipoCartao.descricao',
    'unidadeGestora.orgaoMaximo.nome',
    'portador.nome',
    'Year',
]
df = df[kept_columns]


#df['Fee'] = df['Fee'].map(lambda x: x - (x*10/100))


def convert_value_to_float(number):
    """Convert an amount written in Brazilian notation to a float.

    The input uses "." as the thousands separator and "," as the decimal
    separator, e.g. "1.040,00". The conversion is done with plain string
    replacement, so it neither depends on a specific locale being installed
    nor changes the process-wide locale.

    Args:
        number: Amount as a string, e.g. "139,90" or "1.234.567,89".

    Returns:
        The amount as a float, e.g. 139.9 or 1234567.89.

    Raises:
        ValueError: If the text left after normalising the separators is not
            a valid number.
    """
    # Drop the thousands separators first, then turn the decimal comma into
    # the decimal point that float() expects.
    return float(number.replace('.', '').replace(',', '.'))


def create_dict_with_conv_values(df):
    """Map each transaction id to its amount converted to a float.

    Args:
        df: Frame with an 'id' column and a 'valorTransacao' column holding
            the amounts as strings.

    Returns:
        A dict keyed by id whose values are the results of
        ``convert_value_to_float``. When an id occurs more than once, the
        value of its last occurrence is kept.
    """
    id_amount_pairs = zip(df['id'], df['valorTransacao'])
    return {
        transaction_id: convert_value_to_float(raw_amount)
        for transaction_id, raw_amount in id_amount_pairs
    }


def convert_dict_to_df(dict, column1, column2):
    """Build a two-column DataFrame from a mapping.

    Args:
        dict: Mapping whose keys fill the first column and whose values fill
            the second column, one row per item.
        column1: Name of the column that receives the keys.
        column2: Name of the column that receives the values.

    Returns:
        A DataFrame with the columns ``column1`` and ``column2``.
    """
    rows = list(dict.items())
    column_names = [column1, column2]
    return pd.DataFrame(rows, columns=column_names)


# Convert the amounts, then join them back onto the original rows by id.
# The merge suffixes the clashing columns: "_x" is the original string column
# and "_y" the converted one, so only "_y" is kept.
df_converted = convert_dict_to_df(
    create_dict_with_conv_values(df),
    'id',
    'valorTransacao',
)
new_df = pd.merge(df, df_converted, on='id').drop(columns='valorTransacao_x')
new_df


# Give the remaining columns short English names.
column_labels = {
    'mesExtrato': 'month',
    'tipoCartao.descricao': 'description',
    'valorTransacao_y': 'trans_value',
    'unidadeGestora.orgaoMaximo.nome': 'gov_name',
    'portador.nome': 'person_name',
}
new_df = new_df.rename(columns=column_labels)
new_df


# Plot the numeric ``trans_value`` column of the cleaned frame. The
# ``valorTransacao`` column of the raw yearly frames holds strings such as
# "1.040,00", which cannot be binned or summarised as numbers.
spend_2022 = new_df[new_df['Year'] == 2022]
spend_2023 = new_df[new_df['Year'] == 2023]

fig_22 = px.histogram(spend_2022, x='trans_value')
fig_22.show()
fig_23 = px.histogram(spend_2023, x='trans_value')
fig_23.show()


fig_22 = px.box(spend_2022, y='trans_value')
fig_22.show()


def find_outliers_IQR(df, multiplier=1.5):
    """Return the values lying outside the interquartile-range fences.

    A value is an outlier when it is below ``q1 - multiplier * IQR`` or above
    ``q3 + multiplier * IQR``, where ``q1`` and ``q3`` are the 25th and 75th
    percentiles and ``IQR = q3 - q1``.

    Args:
        df: Numeric pandas Series (or DataFrame) to inspect. String data is
            not supported, because the quantiles cannot be computed on it.
        multiplier: Width of the fences in units of the IQR. The default of
            1.5 matches the conventional box-plot whiskers.

    Returns:
        The elements of ``df`` that fall outside the fences, selected with a
        boolean mask.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1

    lower_fence = q1 - multiplier * IQR
    upper_fence = q3 + multiplier * IQR

    outliers = df[((df < lower_fence) | (df > upper_fence))]

    return outliers


# Use the converted numeric amounts of 2022. The raw ``valorTransacao``
# column of ``df_2022`` holds strings, on which the quantiles cannot be
# computed (TypeError: unsupported operand type(s) for -: 'str' and 'str').
# ``df`` is deliberately not rebound, so the cleaned frame stays available.
amounts_2022 = new_df.loc[new_df['Year'] == 2022, 'trans_value']
outliers = find_outliers_IQR(amounts_2022)
print('number of outliers: '+ str(len(outliers)))
print('max outlier value: '+ str(outliers.max()))
print('min outlier value: '+ str(outliers.min()))

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[62], line 2
      1 df = df_2022
----> 2 outliers = find_outliers_IQR(df['valorTransacao'])
      3 print('number of outliers: '+ str(len(outliers)))
      4 print('max outlier value: '+ str(outliers.max()))

Cell In[61], line 3, in find_outliers_IQR(df)
      1 def find_outliers_IQR(df):
----> 3    q1=df.quantile(0.25)
      5    q3=df.quantile(0.75)
      7    IQR=q3-q1

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/series.py:2650, in Series.quantile(self, q, interpolation)
   2646 # We dispatch to DataFrame so that core.internals only has to worry
   2647 #  about 2D cases.
   2648 df = self.to_frame()
-> 2650 result = df.quantile(q=q, interpolation=interpolation, numeric_only=False)
   2651 if result.ndim == 2:
   2652     result = result.iloc[:, 0]

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/frame.py:10882, in DataFrame.quantile(self, q, axis, numeric_only, interpolation, method)
  10875 axis = self._get_axis_number(axis)
  10877 if not is_list_like(q):
  10878     # BlockManager.quantile expects listlike, so we wrap and unwrap here
  10879     # error: List item 0 has incompatible type "Union[float, Union[Union[
  10880     # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
  10881     # expected "float"
> 10882     res_df = self.quantile(  # type: ignore[call-overload]
  10883         [q],
  10884         axis=axis,
  10885         numeric_only=numeric_only,
  10886         interpolation=interpolation,
  10887         method=method,
  10888     )
  10889     if method == "single":
  10890         res = res_df.iloc[0]

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/frame.py:10927, in DataFrame.quantile(self, q, axis, numeric_only, interpolation, method)
  10923     raise ValueError(
  10924         f"Invalid method: {method}. Method must be in {valid_method}."
  10925     )
  10926 if method == "single":
> 10927     res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
  10928 elif method == "table":
  10929     valid_interpolation = {"nearest", "lower", "higher"}

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/internals/managers.py:1587, in BlockManager.quantile(self, qs, axis, interpolation)
   1584 new_axes = list(self.axes)
   1585 new_axes[1] = Index(qs, dtype=np.float64)
-> 1587 blocks = [
   1588     blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
   1589     for blk in self.blocks
   1590 ]
   1592 return type(self)(blocks, new_axes)

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/internals/managers.py:1588, in <listcomp>(.0)
   1584 new_axes = list(self.axes)
   1585 new_axes[1] = Index(qs, dtype=np.float64)
   1587 blocks = [
-> 1588     blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
   1589     for blk in self.blocks
   1590 ]
   1592 return type(self)(blocks, new_axes)

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/internals/blocks.py:1463, in Block.quantile(self, qs, interpolation, axis)
   1460 assert axis == 1  # only ever called this way
   1461 assert is_list_like(qs)  # caller is responsible for this
-> 1463 result = quantile_compat(self.values, np.asarray(qs._values), interpolation)
   1464 # ensure_block_shape needed for cases where we start with EA and result
   1465 #  is ndarray, e.g. IntegerArray, SparseArray
   1466 result = ensure_block_shape(result, ndim=2)

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/array_algos/quantile.py:37, in quantile_compat(values, qs, interpolation)
     35     fill_value = na_value_for_dtype(values.dtype, compat=False)
     36     mask = isna(values)
---> 37     return quantile_with_mask(values, mask, fill_value, qs, interpolation)
     38 else:
     39     return values._quantile(qs, interpolation)

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/array_algos/quantile.py:95, in quantile_with_mask(values, mask, fill_value, qs, interpolation)
     93     result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
     94 else:
---> 95     result = _nanpercentile(
     96         values,
     97         qs * 100.0,
     98         na_value=fill_value,
     99         mask=mask,
    100         interpolation=interpolation,
    101     )
    103     result = np.array(result, copy=False)
    104     result = result.T

File /opt/homebrew/lib/python3.11/site-packages/pandas/core/array_algos/quantile.py:216, in _nanpercentile(values, qs, na_value, mask, interpolation)
    214     return result
    215 else:
--> 216     return np.percentile(
    217         values,
    218         qs,
    219         axis=1,
    220         # error: No overload variant of "percentile" matches argument types
    221         # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
    222         # "int", "Dict[str, str]"  [call-overload]
    223         **{np_percentile_argname: interpolation},  # type: ignore[call-overload]
    224     )

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:4283, in percentile(a, q, axis, out, overwrite_input, method, keepdims, interpolation)
   4281 if not _quantile_is_valid(q):
   4282     raise ValueError("Percentiles must be in the range [0, 100]")
-> 4283 return _quantile_unchecked(
   4284     a, q, axis, out, overwrite_input, method, keepdims)

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:4555, in _quantile_unchecked(a, q, axis, out, overwrite_input, method, keepdims)
   4547 def _quantile_unchecked(a,
   4548                         q,
   4549                         axis=None,
   (...)
   4552                         method="linear",
   4553                         keepdims=False):
   4554     """Assumes that q is in [0, 1], and is an ndarray"""
-> 4555     return _ureduce(a,
   4556                     func=_quantile_ureduce_func,
   4557                     q=q,
   4558                     keepdims=keepdims,
   4559                     axis=axis,
   4560                     out=out,
   4561                     overwrite_input=overwrite_input,
   4562                     method=method)

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:3823, in _ureduce(a, func, keepdims, **kwargs)
   3820             index_out = (0, ) * nd
   3821             kwargs['out'] = out[(Ellipsis, ) + index_out]
-> 3823 r = func(a, **kwargs)
   3825 if out is not None:
   3826     return out

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:4721, in _quantile_ureduce_func(a, q, axis, out, overwrite_input, method)
   4719     else:
   4720         arr = a.copy()
-> 4721 result = _quantile(arr,
   4722                    quantiles=q,
   4723                    axis=axis,
   4724                    method=method,
   4725                    out=out)
   4726 return result

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:4840, in _quantile(arr, quantiles, axis, method, out)
   4838     result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1)
   4839     gamma = gamma.reshape(result_shape)
-> 4840     result = _lerp(previous,
   4841                    next,
   4842                    gamma,
   4843                    out=out)
   4844 if np.any(slices_having_nans):
   4845     if result.ndim == 0 and out is None:
   4846         # can't write to a scalar, but indexing will be correct

File /opt/homebrew/lib/python3.11/site-packages/numpy/lib/function_base.py:4655, in _lerp(a, b, t, out)
   4641 def _lerp(a, b, t, out=None):
   4642     """
   4643     Compute the linear interpolation weighted by gamma on each point of
   4644     two same shape array.
   (...)
   4653         Output array.
   4654     """
-> 4655     diff_b_a = subtract(b, a)
   4656     # asanyarray is a stop-gap until gh-13105
   4657     lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

	id	mesExtrato	dataTransacao	valorTransacao	tipoCartao.id	tipoCartao.codigo	tipoCartao.descricao	estabelecimento.id	estabelecimento.cpfFormatado	estabelecimento.cnpjFormatado	...	unidadeGestora.orgaoVinculado.codigoSIAFI	unidadeGestora.orgaoVinculado.cnpj	unidadeGestora.orgaoVinculado.sigla	unidadeGestora.orgaoVinculado.nome	unidadeGestora.orgaoMaximo.codigo	unidadeGestora.orgaoMaximo.sigla	unidadeGestora.orgaoMaximo.nome	portador.cpfFormatado	portador.nis	portador.nome
0	346550462	03/2023	06/02/2023	366,00	1	1	Cartão de Pagamento do Governo Federal - CPGF	42308874		03.417.622/0001-08	...	26280	45358058000140	FUF/São Carlos	Fundação Universidade Federal de São Carlos	26000	MEC	Ministério da Educação	*.046.888-		ADEMIR PADILHA ARRUDA JUNIOR
1	346550464	03/2023	13/02/2023	60,00	1	1	Cartão de Pagamento do Governo Federal - CPGF	-2			...	25000	00394460000141	MFAZ	Ministério da Fazenda - Unidades com vínculo d...	25000	MFAZ	Ministério da Fazenda	*.114.656-		GERALDO PAMPLONA FILHO
2	346550465	03/2023	10/02/2023	425,00	1	1	Cartão de Pagamento do Governo Federal - CPGF	-1			...	25205	33787094000140	IBGE	Fundação Instituto Brasileiro de Geografia e E...	47000	MPO	Ministério do Planejamento e Orçamento	*.753.217-		BRUNO BRANDAO PEREIRA MACHADO
3	346550469	03/2023	03/02/2023	340,00	1	1	Cartão de Pagamento do Governo Federal - CPGF	46626839		85.076.198/0001-40	...	30108	00394494000136	PF	Polícia Federal	30000	JUSTIÇA	Ministério da Justiça e Segurança Pública	*.474.899-		LUIZ ADONIS KUHL
4	346550470	03/2023	09/02/2023	1.000,00	1	1	Cartão de Pagamento do Governo Federal - CPGF	-2			...	25205	33787094000140	IBGE	Fundação Instituto Brasileiro de Geografia e E...	47000	MPO	Ministério do Planejamento e Orçamento	*.222.849-		DELMO DE CARVALHO
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11210	375255596	03/2023	10/02/2023	106,27	1	1	Cartão de Pagamento do Governo Federal - CPGF	6619790		38.616.553/0001-46	...	25205	33787094000140	IBGE	Fundação Instituto Brasileiro de Geografia e E...	47000	MPO	Ministério do Planejamento e Orçamento	*.805.196-		OBEDE EDON DA SILVA
11211	375255601	03/2023	09/02/2023	2.147,47	1	1	Cartão de Pagamento do Governo Federal - CPGF	6019904		07.575.651/0001-59	...	30108	00394494000136	PF	Polícia Federal	30000	JUSTIÇA	Ministério da Justiça e Segurança Pública	*.376.011-		LUCIARA NEVES DE SOUZA
11212	375255602	03/2023	09/02/2023	2.231,76	1	1	Cartão de Pagamento do Governo Federal - CPGF	6019904		07.575.651/0001-59	...	30108	00394494000136	PF	Polícia Federal	30000	JUSTIÇA	Ministério da Justiça e Segurança Pública	*.376.011-		LUCIARA NEVES DE SOUZA
11213	375255603	03/2023	09/02/2023	1.112,73	1	1	Cartão de Pagamento do Governo Federal - CPGF	6019904		07.575.651/0001-59	...	30108	00394494000136	PF	Polícia Federal	30000	JUSTIÇA	Ministério da Justiça e Segurança Pública	*.376.011-		LUCIARA NEVES DE SOUZA
11214	375255604	03/2023	09/02/2023	3.158,83	1	1	Cartão de Pagamento do Governo Federal - CPGF	6019904		07.575.651/0001-59	...	30108	00394494000136	PF	Polícia Federal	30000	JUSTIÇA	Ministério da Justiça e Segurança Pública	*.376.011-		LUCIARA NEVES DE SOUZA

	id	mesExtrato	valorTransacao	tipoCartao.descricao	unidadeGestora.orgaoMaximo.nome	portador.nome	Year
0	346550115	09/2022	139,90	Cartão de Pagamento do Governo Federal - CPGF	Ministério de Minas e Energia	GABRIELA COSTA STOLL	2022
1	346550116	04/2022	1.040,00	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	ELIAS ANTONIO DOS SANTOS	2022
2	346550117	06/2022	75,00	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Defesa	DJALMA SILVA DOS SANTOS	2022
3	346550120	07/2022	200,00	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	EMIVAL LUDOVINO DE SANTANA	2022
4	346550124	05/2022	202,50	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Fazenda	ANTONIO BARBOSA ABREU JUNIOR	2022
...	...	...	...	...	...	...	...
11205	375255596	03/2023	106,27	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	OBEDE EDON DA SILVA	2023
11206	375255601	03/2023	2.147,47	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023
11207	375255602	03/2023	2.231,76	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023
11208	375255603	03/2023	1.112,73	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023
11209	375255604	03/2023	3.158,83	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023

	id	mesExtrato	tipoCartao.descricao	unidadeGestora.orgaoMaximo.nome	portador.nome	Year	valorTransacao_y
0	346550115	09/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério de Minas e Energia	GABRIELA COSTA STOLL	2022	139.90
1	346550116	04/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	ELIAS ANTONIO DOS SANTOS	2022	1040.00
2	346550117	06/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Defesa	DJALMA SILVA DOS SANTOS	2022	75.00
3	346550120	07/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	EMIVAL LUDOVINO DE SANTANA	2022	200.00
4	346550124	05/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Fazenda	ANTONIO BARBOSA ABREU JUNIOR	2022	202.50
...	...	...	...	...	...	...	...
52288	375255596	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	OBEDE EDON DA SILVA	2023	106.27
52289	375255601	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	2147.47
52290	375255602	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	2231.76
52291	375255603	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	1112.73
52292	375255604	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	3158.83

	id	month	description	gov_name	person_name	Year	trans_value
0	346550115	09/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério de Minas e Energia	GABRIELA COSTA STOLL	2022	139.90
1	346550116	04/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	ELIAS ANTONIO DOS SANTOS	2022	1040.00
2	346550117	06/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Defesa	DJALMA SILVA DOS SANTOS	2022	75.00
3	346550120	07/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	EMIVAL LUDOVINO DE SANTANA	2022	200.00
4	346550124	05/2022	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Fazenda	ANTONIO BARBOSA ABREU JUNIOR	2022	202.50
...	...	...	...	...	...	...	...
52288	375255596	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério do Planejamento e Orçamento	OBEDE EDON DA SILVA	2023	106.27
52289	375255601	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	2147.47
52290	375255602	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	2231.76
52291	375255603	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	1112.73
52292	375255604	03/2023	Cartão de Pagamento do Governo Federal - CPGF	Ministério da Justiça e Segurança Pública	LUCIARA NEVES DE SOUZA	2023	3158.83

Project Description and KPIs¶

Case: Analyzing Government Credit Card Expenditures¶

Data 📊 - Description and modeling¶

Step 1: Data Collection¶

Step 2: Data Cleaning, Validation and Inspection¶

Removing Duplicates¶

Identifying/Handling Null and Missing Data¶

Identifying/Handling Negative Values¶

Removing unnecessary columns¶

Step 3. Data Transformation¶

Exploratory Data Analysis (IN PROGRESS)¶

Identifying/Handling Outliers¶