1. 데이터 분석 준비

1) 패키지 가져오기

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import missingno as msno
# Imputer was deprecated 3 versions ago and removed in 0.22
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Show up to 100 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 100

2) 데이터 가져오기

# Load the training and test CSV files, in that order, into DataFrames.
trainset, testset = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/porto-seguro-safe-driver-prediction/train.csv',
        '../data/porto-seguro-safe-driver-prediction/test.csv',
    )
)

# Summary statistics of the numeric training columns.
trainset.describe()

2. 데이터 설명

몇 개의 그룹이 정의되어 있고 이러한 그룹에 속하는 특징에는 이름(ind, reg, car, calc)의 패턴이 포함되어 있다.
- ind는 개인(individual)을, reg은 등록(registration)을, car는 이름 그대로 자동차를, calc은 계산된(calculated) 필드를 나타낸다.
bin은 이항 feature로 사용된다.
cat는 범주형 feature로 사용된다.
bin 또는 cat 표시가 없는 feature는 연속형(실수)이거나 순서형(정수) 값이다.
결측값은 -1로 표시된다.
예측의 대상이 되는 값은 target column에 있다. 이것은 피보험자에 대한 청구 여부를 나타낸다.
- 운전자가 내년에 보험 청구를 할 것인지 예측하는 대회!
- target = 1 : 보험을 청구한다.
- target = 0 : 보험을 청구하지 않는다.
id는 데이터 입력 순서 번호다.

# Peek at the first rows of the training data.
trainset.head()

# Report the (rows, cols) shape of both datasets with a single print call.
shape_report = (
    "Train dataset (row, cols):", trainset.shape,
    "\nTestdataset (row, cols):", testset.shape,
)
print(*shape_report)

Train dataset (row, cols): (595212, 59) 
Testdataset (row, cols): (892816, 58)

# The task is to predict what goes into the `target` column of the test set,
# so list the columns that exist in the training set but not in the test set.
train_only_columns = set(trainset.columns).difference(testset.columns)
print("Columns in train and not in test dataset:", train_only_columns)

Columns in train and not in test dataset: {'target'}

3. Metadata 소개

데이터 조작을 용이하게 하기 위해, 우리는 학습 데이터셋(trainset)의 변수와 몇 가지 메타 정보를 연관시킬 것이다. 이렇게 하면 분석, 검사 또는 모델링을 위한 다양한 유형의 feature의 선택이 용이해진다. car, ind, reg, calc 타입의 특징들을 위한 카테고리 필드도 사용한다.

use: input, id, target
type: binary, categorical, real, integer
preserve: True or False
dtype: int64, float64
category: individual, registration, car, calculated, none

1) Metadata dataframe 생성

# Build one metadata record per column of the training set so that features
# can later be selected by role ('use'), measurement type ('type'),
# keep/drop flag ('preserve'), storage dtype and feature group ('category').
data = []
for feature in trainset.columns:
    # Role of the column: the label, the row identifier, or a model input.
    if feature == 'target':
        use = 'target'
    elif feature == 'id':
        use = 'id'
    else:
        use = 'input'

    dtype = trainset[feature].dtype

    # Measurement type. Name-based markers take precedence over the dtype.
    # The result is stored in `var_type` (not `type`) so the builtin is not
    # shadowed, and the explicit fallback keeps a value from a previous
    # iteration from leaking into a column whose dtype matches no branch.
    if 'bin' in feature or feature == 'target':
        var_type = 'binary'
    elif 'cat' in feature or feature == 'id':
        var_type = 'categorical'
    elif pd.api.types.is_float_dtype(dtype):
        var_type = 'real'
    elif pd.api.types.is_integer_dtype(dtype):
        var_type = 'integer'
    else:
        var_type = 'unknown'

    # Every column is kept except the row identifier.
    preserve = feature != 'id'

    # Feature group, taken from the first group marker found in the name.
    if 'ind' in feature:
        category = 'individual'
    elif 'reg' in feature:
        category = 'registration'
    elif 'car' in feature:
        category = 'car'
    elif 'calc' in feature:
        category = 'calculated'
    else:
        category = 'none'

    data.append({
        'varname': feature,
        'use': use,
        'type': var_type,
        'preserve': preserve,
        'dtype': dtype,
        'category': category,
    })

# One row per feature, indexed by the feature name.
metadata = pd.DataFrame(data, columns=['varname', 'use', 'type', 'preserve',
                                       'dtype', 'category'])
metadata.set_index('varname', inplace=True)
metadata

2) Metadata 데이터 분포 확인

# Names of the categorical variables that are flagged to be preserved.
is_preserved_categorical = (metadata['type'] == 'categorical') & metadata['preserve']
metadata[is_preserved_categorical].index

Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object', name='varname')

# Number of features in each feature group ('category').
metadata.groupby(['category'])['category'].size().to_frame(name='count').reset_index()

# Number of features for each combination of 'use' and 'type'.
metadata.groupby(['use', 'type'])['category'].size().to_frame(name='count').reset_index()

	id	target	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_05_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	ps_ind_09_bin	ps_ind_10_bin	ps_ind_11_bin	ps_ind_12_bin	ps_ind_13_bin	ps_ind_14	ps_ind_15	ps_ind_16_bin	ps_ind_17_bin	ps_ind_18_bin	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_01_cat	ps_car_02_cat	ps_car_03_cat	ps_car_04_cat	ps_car_05_cat	ps_car_06_cat	ps_car_07_cat	ps_car_08_cat	ps_car_09_cat	ps_car_10_cat	ps_car_11_cat	ps_car_11	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03	ps_calc_04	ps_calc_05	ps_calc_06	ps_calc_07	ps_calc_08	ps_calc_09	ps_calc_10	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_15_bin	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
count	5.952120e+05	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000	595212.000000
mean	7.438036e+05	0.036448	1.900378	1.358943	4.423318	0.416794	0.405188	0.393742	0.257033	0.163921	0.185304	0.000373	0.001692	0.009439	0.000948	0.012451	7.299922	0.660823	0.121081	0.153446	0.610991	0.439184	0.551102	8.295933	0.829931	-0.504899	0.725192	-0.157732	6.555340	0.910027	0.832080	1.328890	0.992136	62.215674	2.346072	0.379945	0.813265	0.276256	3.065899	0.449756	0.449589	0.449849	2.372081	1.885886	7.689445	3.005823	9.225904	2.339034	8.433590	5.441382	1.441918	2.872288	7.539026	0.122427	0.627840	0.554182	0.287182	0.349024	0.153318
std	4.293678e+05	0.187401	1.983789	0.664594	2.699902	0.493311	1.350642	0.488579	0.436998	0.370205	0.388544	0.019309	0.041097	0.096693	0.030768	0.127545	3.546042	0.473430	0.326222	0.360417	0.287643	0.404264	0.793506	2.508270	0.375716	0.788654	2.153463	0.844417	5.501445	0.347106	0.373796	0.978747	0.091619	33.012455	0.832548	0.058327	0.224588	0.357154	0.731366	0.287198	0.286893	0.287153	1.117219	1.134927	1.334312	1.414564	1.459672	1.246949	2.904597	2.332871	1.202963	1.694887	2.746652	0.327779	0.483381	0.497056	0.452447	0.476662	0.360295
min	7.000000e+00	0.000000	0.000000	-1.000000	0.000000	-1.000000	-1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-1.000000	-1.000000	-1.000000	-1.000000	0.000000	-1.000000	0.000000	-1.000000	0.000000	-1.000000	0.000000	1.000000	-1.000000	-1.000000	0.250619	-1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	3.719915e+05	0.000000	0.000000	1.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	5.000000	0.000000	0.000000	0.000000	0.400000	0.200000	0.525000	7.000000	1.000000	-1.000000	0.000000	-1.000000	1.000000	1.000000	1.000000	0.000000	1.000000	32.000000	2.000000	0.316228	0.670867	0.333167	2.828427	0.200000	0.200000	0.200000	2.000000	1.000000	7.000000	2.000000	8.000000	1.000000	6.000000	4.000000	1.000000	2.000000	6.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	7.435475e+05	0.000000	1.000000	1.000000	4.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	7.000000	1.000000	0.000000	0.000000	0.700000	0.300000	0.720677	7.000000	1.000000	-1.000000	0.000000	0.000000	7.000000	1.000000	1.000000	2.000000	1.000000	65.000000	3.000000	0.374166	0.765811	0.368782	3.316625	0.500000	0.400000	0.500000	2.000000	2.000000	8.000000	3.000000	9.000000	2.000000	8.000000	5.000000	1.000000	3.000000	7.000000	0.000000	1.000000	1.000000	0.000000	0.000000	0.000000
75%	1.115549e+06	0.000000	3.000000	2.000000	6.000000	1.000000	0.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	10.000000	1.000000	0.000000	0.000000	0.900000	0.600000	1.000000	11.000000	1.000000	0.000000	0.000000	1.000000	11.000000	1.000000	1.000000	2.000000	1.000000	93.000000	3.000000	0.400000	0.906190	0.396485	3.605551	0.700000	0.700000	0.700000	3.000000	3.000000	9.000000	4.000000	10.000000	3.000000	10.000000	7.000000	2.000000	4.000000	9.000000	0.000000	1.000000	1.000000	1.000000	1.000000	0.000000
max	1.488027e+06	1.000000	7.000000	4.000000	11.000000	1.000000	6.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	4.000000	13.000000	1.000000	1.000000	1.000000	0.900000	1.800000	4.037945	11.000000	1.000000	1.000000	9.000000	1.000000	17.000000	1.000000	1.000000	4.000000	2.000000	104.000000	3.000000	1.264911	3.720626	0.636396	3.741657	0.900000	0.900000	0.900000	5.000000	6.000000	10.000000	9.000000	12.000000	7.000000	25.000000	19.000000	10.000000	13.000000	23.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	ps_ind_15	ps_ind_16_bin	ps_ind_17_bin	ps_ind_18_bin	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_01_cat	ps_car_02_cat	ps_car_03_cat	ps_car_05_cat	ps_car_06_cat	ps_car_07_cat	ps_car_08_cat	ps_car_09_cat	ps_car_10_cat	ps_car_11_cat	ps_car_11	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03	ps_calc_04	ps_calc_05	ps_calc_06	ps_calc_07	ps_calc_08	ps_calc_09	ps_calc_10	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	11	0	1	0	0.7	0.2	0.718070	10	1	-1	1	4	1	0	0	1	12	2	0.400000	0.883679	0.370810	3.605551	0.6	0.5	0.2	3	1	10	1	10	1	5	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	3	0	0	1	0.8	0.4	0.766078	11	1	-1	-1	11	1	1	2	1	19	3	0.316228	0.618817	0.388716	2.449490	0.3	0.1	0.3	2	1	9	5	8	1	7	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	12	1	0	0	0.0	0.0	-1.000000	7	1	-1	-1	14	1	1	2	1	60	1	0.316228	0.641586	0.347275	3.316625	0.5	0.7	0.1	2	2	9	1	8	2	7	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	8	1	0	0	0.9	0.2	0.580948	7	1	0	1	11	1	1	3	1	104	1	0.374166	0.542949	0.294958	2.000000	0.6	0.9	0.1	2	4	7	1	8	4	2	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	9	1	0	0	0.7	0.6	0.840759	11	1	-1	-1	14	1	1	2	1	82	3	0.316070	0.565832	0.365103	2.000000	0.4	0.6	0.0	2	2	6	3	10	2	12	3	1	1	3	0	0	1	1	0

[kaggle][필사] Porto Seguro Safe Driver Prediction(Gabriel Preda) (2) (0)	2020.09.08
[kaggle] Porto Seguro Safe Driver Prediction(Bert Carremans) (2) (0)	2020.09.06
[kaggle] Porto Seguro Safe Driver Prediction(Bert Carremans) (1) (0)	2020.09.05

춤추는 개발자

[kaggle][필사] Porto Seguro Safe Driver Prediction(Gabriel Preda) (1)

1. 데이터 분석 준비

1) 패키지 가져오기

2) 데이터 가져오기

2. 데이터 설명

3. Metadata 소개

1) Metadata dataframe 생성¶

2) Metadata 데이터 분포 확인

'Competition > Kaggle' 카테고리의 다른 글

'Competition/Kaggle'의 다른글

티스토리툴바

	use	type	preserve	dtype	category
varname
id	id	categorical	False	int64	none
target	target	binary	True	int64	none
ps_ind_01	input	integer	True	int64	individual
ps_ind_02_cat	input	categorical	True	int64	individual
ps_ind_03	input	integer	True	int64	individual
ps_ind_04_cat	input	categorical	True	int64	individual
ps_ind_05_cat	input	categorical	True	int64	individual
ps_ind_06_bin	input	binary	True	int64	individual
ps_ind_07_bin	input	binary	True	int64	individual
ps_ind_08_bin	input	binary	True	int64	individual
ps_ind_09_bin	input	binary	True	int64	individual
ps_ind_10_bin	input	binary	True	int64	individual
ps_ind_11_bin	input	binary	True	int64	individual
ps_ind_12_bin	input	binary	True	int64	individual
ps_ind_13_bin	input	binary	True	int64	individual
ps_ind_14	input	integer	True	int64	individual
ps_ind_15	input	integer	True	int64	individual
ps_ind_16_bin	input	binary	True	int64	individual
ps_ind_17_bin	input	binary	True	int64	individual
ps_ind_18_bin	input	binary	True	int64	individual
ps_reg_01	input	real	True	float64	registration
ps_reg_02	input	real	True	float64	registration
ps_reg_03	input	real	True	float64	registration
ps_car_01_cat	input	categorical	True	int64	car
ps_car_02_cat	input	categorical	True	int64	car
ps_car_03_cat	input	categorical	True	int64	car
ps_car_04_cat	input	categorical	True	int64	car
ps_car_05_cat	input	categorical	True	int64	car
ps_car_06_cat	input	categorical	True	int64	car
ps_car_07_cat	input	categorical	True	int64	car
ps_car_08_cat	input	categorical	True	int64	car
ps_car_09_cat	input	categorical	True	int64	car
ps_car_10_cat	input	categorical	True	int64	car
ps_car_11_cat	input	categorical	True	int64	car
ps_car_11	input	integer	True	int64	car
ps_car_12	input	real	True	float64	car
ps_car_13	input	real	True	float64	car
ps_car_14	input	real	True	float64	car
ps_car_15	input	real	True	float64	car

	category	count
0	calculated	20
1	car	16
2	individual	18
3	none	2
4	registration	3

	use	type	count
0	id	categorical	1
1	input	binary	17
2	input	categorical	14
3	input	integer	16
4	input	real	10
5	target	binary	1

[kaggle][필사] Porto Seguro Safe Driver Prediction(Gabriel Preda) (1)

1. 데이터 분석 준비

1) 패키지 가져오기

2) 데이터 가져오기

2. 데이터 설명

3. Metadata 소개

1) Metadata dataframe 생성¶

2) Metadata 데이터 분포 확인

'Competition > Kaggle' 카테고리의 다른 글

'Competition/Kaggle'의 다른글

관련글

티스토리툴바