import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Render plots inline inside the notebook
%matplotlib inline

# Use the ggplot style so the grid makes numeric ranges easy to read
plt.style.use('ggplot')

# Work around the minus sign being rendered as a broken glyph in plots
mpl.rcParams['axes.unicode_minus'] = False

# Install the fonts-nanum package through apt (IPython shell escape)
!apt -qq -y install fonts-nanum

# Fix broken Hangul glyphs in plots by registering the NanumBarunGothic font
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

# Make the freshly installed font file known to matplotlib. The public
# FontManager.addfont API is preferred; the private _rebuild() helper is only
# kept as a fallback for old matplotlib releases that lack addfont.
if hasattr(fm.fontManager, 'addfont'):
    fm.fontManager.addfont(fontpath)
else:
    fm._rebuild()

# Select the font family as the default for all text in subsequent plots
plt.rc('font', family='NanumBarunGothic')

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

fonts-nanum is already the newest version (20170925-1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.

# Access Google Drive from Colab by mounting it at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

# Load the bike rental CSV from the mounted Drive. The file is read with the
# cp949 encoding and the 대여일자 column is parsed as datetime.
csv_path = '/content/gdrive/My Drive/data/bicycle-hourtime-201905-test.csv'
bike = pd.read_csv(csv_path, encoding='cp949', parse_dates=["대여일자"])

# Display the column labels of the loaded frame
bike.columns

Index(['대여일자', '요일', '요일_New', '주중주말구분', '대여시간', '대여소번호', '대여소명', '지구',
       '지구_New', '거치대수', '대여구분코드', '대여구분코드_new', '성별', '성별_New', '연령대코드',
       '연령대코드_New', '이용건수', '운동량', '탄소량', '이동거리', '사용시간'],
      dtype='object')

# Derive year, month and day columns from the parsed rental date
rental_date = bike["대여일자"].dt
bike["년"] = rental_date.year
bike["월"] = rental_date.month
bike["일"] = rental_date.day

상관관계 분석

# Columns considered for the pairwise correlation analysis
corr_columns = [
    "요일_New", "대여시간", "대여소번호", "거치대수", "대여구분코드_new",
    "성별_New", "연령대코드_New", "이용건수", "사용시간", "지구", "주중주말구분",
]

# Keep only numeric columns explicitly before correlating: newer pandas raises
# on non-numeric input instead of silently dropping it.
# NOTE(review): "지구" is absent from the printed 10x10 matrix, so it appears to
# be non-numeric; "지구_New" may have been the intended column - confirm.
corrMatt = bike[corr_columns].select_dtypes(include=["number", "bool"]).corr()
print(corrMatt)

# Boolean mask that hides the strict upper triangle of the heatmap, leaving
# the diagonal and lower triangle visible. Building it from booleans (rather
# than from the correlation values themselves) also hides cells whose
# correlation happens to be exactly 0.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

              요일_New      대여시간     대여소번호  ...      이용건수      사용시간    주중주말구분
요일_New      1.000000 -0.004396 -0.001738  ... -0.002344 -0.020543  0.063482
대여시간       -0.004396  1.000000 -0.003864  ...  0.102650  0.158501  0.006246
대여소번호      -0.001738 -0.003864  1.000000  ... -0.052148 -0.066636 -0.022332
거치대수       -0.005174  0.026647  0.010230  ...  0.106658  0.106789  0.049121
대여구분코드_new  0.012606 -0.085337  0.032199  ...  0.015866 -0.223490 -0.138814
성별_New      0.002052  0.095304  0.012678  ...  0.007619  0.083457  0.005936
연령대코드_New   0.005822 -0.090134  0.079696  ... -0.129476 -0.065439 -0.050950
이용건수       -0.002344  0.102650 -0.052148  ...  1.000000  0.470068 -0.011573
사용시간       -0.020543  0.158501 -0.066636  ...  0.470068  1.000000  0.087228
주중주말구분      0.063482  0.006246 -0.022332  ... -0.011573  0.087228  1.000000

[10 rows x 10 columns]

# Draw the annotated correlation heatmap on a 20x10 inch figure; cells
# selected by `mask` are hidden and the colour scale is capped at 0.8.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7f0fb1dd68>

# 2x2 grid: top row shows the raw usage count, bottom row shows the
# log-transformed count after outlier removal. The left column is the
# distribution plot, the right column the normal probability plot.
figure, axes = plt.subplots(ncols=2, nrows=2)
figure.set_size_inches(12, 10)

usage = bike["이용건수"]

# `trainWithoutOutliers` was never defined in this notebook (NameError).
# NOTE(review): assumed definition - keep rows whose usage count lies within
# three standard deviations of the mean; confirm the intended outlier rule.
trainWithoutOutliers = bike[np.abs(usage - usage.mean()) <= 3 * usage.std()]

# Use log1p for both bottom plots so they show the same transformed data;
# log1p also stays finite for a count of 0, unlike log.
log_usage = np.log1p(trainWithoutOutliers["이용건수"])

# NOTE: sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot are
# the replacements); it is kept here to match the rest of the notebook.
sns.distplot(usage, ax=axes[0][0])
stats.probplot(usage, dist='norm', fit=True, plot=axes[0][1])
sns.distplot(log_usage, ax=axes[1][0])
stats.probplot(log_usage, dist='norm', fit=True, plot=axes[1][1])

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-70ff6f4c24a0> in <module>()
      4 sns.distplot(bike["이용건수"], ax=axes[0][0])
      5 stats.probplot(bike["이용건수"], dist='norm', fit=True, plot=axes[0][1])
----> 6 sns.distplot(np.log(trainWithoutOutliers["이용건수"]), ax=axes[1][0])
      7 stats.probplot(np.log1p(trainWithoutOutliers["이용건수"]), dist='norm', fit=True, plot=axes[1][1])

NameError: name 'trainWithoutOutliers' is not defined

[python] 공공자전거 데이터 분석(4) - pivot data 생성 (0)	2020.04.28
[python] 공공자전거 데이터 분석(2) - histogram (0)	2020.04.26
[python] 공공자전거 데이터 분석(1) - 데이터 형태, 그래프 출력 (0)	2020.04.25

춤추는 개발자

[python] 공공자전거 데이터 분석(3) - 상관관계 분석

상관관계 분석

상관관계 분석

'Study > Data Analysis' 카테고리의 다른 글

'Study/Data Analysis'의 다른글

티스토리툴바

[python] 공공자전거 데이터 분석(3) - 상관관계 분석

상관관계 분석

상관관계 분석

'Study > Data Analysis' 카테고리의 다른 글

'Study/Data Analysis'의 다른글

관련글

티스토리툴바