반응형
Pandas¶
In [1]:
import pandas as pd
In [2]:
#갖고오기 (comma ,)
df1 = pd.read_csv('test.csv')
#tab으로 구분
df2 = pd.read_csv('test_tab.txt',delimiter='\t')
#header 없는경우
df3 = pd.read_csv('test_noheader.csv',header=None)
df3.columns = ['a','b','c']
#header 없는경우2
df4 = pd.read_csv('test_noheader.csv',header=None, names=['aa','bb','cc'])
2. Data Frame 만들기¶
2.1 pd.DataFrame.(list)¶
In [3]:
#딕셔너리로 만들기-컬럼순서 보장x
list1 = [
{'name':'amy','age':20,'job':'student'},
{'name':'bob','age':22,'job':'worker'}
]
df5= pd.DataFrame(list1)
df5.head()
Out[3]:
2.2 pd.DataFrame.from_dict(dict)¶
In [4]:
#딕셔너리로 만들기-컬럼순서 보장o
from collections import OrderedDict
ordered_dict=OrderedDict(
[
('name',['john','kate']),
('age',[13,30]),
('job',['student','teacher']),
]
)
df6= pd.DataFrame.from_dict(ordered_dict)
df6
Out[4]:
2.3 pd.DataFrame.from_records(list)¶
In [5]:
#리스트로 만들기-컬럼명 추가
list2 = [
['john',20,'student'],
['kate',30,'teacher']
]
column_name =['aa','bb','cc']
df7=pd.DataFrame.from_records(list2,columns=column_name)
df7
Out[5]:
In [6]:
list3 = [
{'name':'amy','age':20,'job':'student'},
{'name':'bob','age':22,'job':'worker'},
{'name':'cla','age':21,'job':None}
]
df5= pd.DataFrame(list3)
df5= df5[['name','age','job']]
#내보내기
df5.to_csv('file_name.csv')
#index 없이 내보내기
df5.to_csv('file_name2.csv',index=False)
# header 없이 내보내기
df5.to_csv('file_name3.csv',header=False)
# null 필드 - 로 내보내기
df5.to_csv('file_name4.csv',na_rep='-')
4.Data 필터링¶
In [7]:
list4 = [
{'name':'amy','age':20,'job':'student'},
{'name':'bob','age':22,'job':'student'},
{'name':'cla','age':21,'job':'worker'}
]
df6= pd.DataFrame(list4)
4.1 df.loc[조건]¶
In [8]:
# 열 1,2
df7 =df6[1:3]
# 열 1,3
df8 =df6.loc[[0,2]]
4.2 df.query(조건)¶
In [9]:
# 조건1
df9 =df6[df6.age>21]
#조건2
df10=df6.query('age>20')
#조건3
df11=df6[(df6.age>20) & (df6.job=='worker')]
4.3 df.iloc[행조건,열조건]¶
In [10]:
#조건4-행1 인 것의 열 1,2
df12=df6.iloc[0:1,0:2]
#조건5-행전체의 열1,2
df13=df6.iloc[:,0:2]
In [11]:
#조건6-컬럼으로
df14=df6[['name','job']]
#조건7-컬럼으로
df15=df6.filter(items=['age'])
#조건8-컬럼명 like
df16=df6.filter(like='ame',axis=1) #axis=1 컬럼중에서
#조건9-컬럼명에 b(정규식)
df17=df6.filter(regex='b$',axis=1)
In [12]:
friends=[
{'age':15, 'job':'student'},
{'age':25, 'job':'teacher'},
{'age':30, 'job':'worker'},
]
df18= pd.DataFrame(friends, index=['jane','john','kate'],columns=['age','job']) #행은 index, 열은 columns
df18
Out[12]:
In [13]:
df18.drop('jane',inplace=True) #inplace 바로 반영
df18
Out[13]:
In [14]:
df20 = df6
df21 = df20.drop(df20.index[[0,2]])
df23 = df6.drop('age', axis=1) #컬럼 age 삭제
In [15]:
df22 = df20[df20.age>20]
In [16]:
df23['salary'] = 20 #컬럼 salary 추가
In [17]:
import numpy as np
df23['sal_yn']= np.where(df23['job'] == 'worker','y','n')
6.2 df['col'] = df['a']+df['b']¶
In [18]:
score=[
{'name':'stud1', 'mid':70,'final':77},
{'name':'stud2', 'mid':80,'final':81},
{'name':'stud3', 'mid':90,'final':92}
]
df24 = pd.DataFrame(score, columns=['name','mid','final'])
df24['total']= df24['mid']+df24['final']
df24['avg']= df24['total']/2
df24
Out[18]:
6.3 df['col'] = list¶
In [19]:
grades= []
for row in df24['avg']:
if row >= 90:
grades.append('A')
elif row >= 80:
grades.append('B')
else:
grades.append('F')
df24['grade'] =grades
grades
Out[19]:
In [20]:
def pass_or_fail(row):
if row != 'F':
return 'Pass'
else:
return 'Fail'
6.4 df['col'] = df.apply(function)¶
In [21]:
df24['pf'] =df24['grade']
df24.pf = df24.pf.apply(pass_or_fail)
In [22]:
date = [
{'yyyy-mm-dd':'2000-06-27'},
{'yyyy-mm-dd':'2007-10-27'}
]
def extract_year(row):
return row.split('-')[0]
df25= pd.DataFrame(date,columns=['yyyy-mm-dd'])
df25['year']= df25['yyyy-mm-dd'].apply(extract_year)
In [23]:
score=[
{'name':'stud1', 'mid':70,'final':77},
{'name':'stud2', 'mid':80,'final':81}
]
df26 = pd.DataFrame(score, columns=['name','mid','final'])
df26
Out[23]:
In [24]:
df27 = pd.DataFrame([
['stud4',50,60],
['stud5',60,66],
], columns=['name','mid','final'])
df27
Out[24]:
In [25]:
df28 = df26.append(df27, ignore_index=True)
df28
Out[25]:
7.2 pd.concat([df1],[df2])¶
In [26]:
list4=[
{'name':'Ace','job':'student','age':20},
{'name':'Bob','job':'student','age':22},
{'name':'Cho','job':'student','age':24},
{'name':'Dab','job':'student','age':26},
{'name':'Joe','job':'worker','age':40}
]
df39=pd.DataFrame(list4[0:3], columns=['name','job','age'])
df40=pd.DataFrame(list4[3:5], columns=['name','job','age'])
#행 합치기
result = pd.concat([df39,df40],ignore_index=True)
#result2 = df39.append(df40, ignore_index=True)
result
Out[26]:
In [27]:
df41=pd.DataFrame(list4, columns=['name','job'])
df42=pd.DataFrame(list4, columns=['age'])
In [28]:
#열 합치기
result3 = pd.concat([df41,df42],axis=1,ignore_index=True) #열로 넣어라
result3
Out[28]:
7.3 pd.DataFrame(dict)¶
In [29]:
#두 list 합치기
label = [1,2,2,4]
pred = [1,2,4,5]
comparison = pd.DataFrame({'label':label,'pred':pred},columns=['label','pred'])
comparison
Out[29]:
8. DataFrame 그룹바이¶
8.1 df.groupby()¶
In [30]:
df30 = pd.read_csv('student.csv')
groupby_major= df30.groupby('major')
groupby_major.groups
Out[30]:
In [31]:
#보기좋게 출력해보기
for name,group in groupby_major:
print(name,"학과:",str(len(group)),'명')
print(group,'\n')
In [32]:
# 그룹별 데이터 개수
df_major_cnt = pd.DataFrame({'count':groupby_major.size()})
df_major_cnt
Out[32]:
In [33]:
#index 초기화
df_major_cnt = pd.DataFrame({'count':groupby_major.size()}).reset_index()
df_major_cnt
Out[33]:
8.2 df.drop_duplicates() --중복행drop¶
In [34]:
df32 = pd.read_csv('student.csv')
df32.tail()
Out[34]:
In [35]:
df32.duplicated()
Out[35]:
In [36]:
df32.drop_duplicates()
Out[36]:
In [37]:
df32.duplicated(['name'])
Out[37]:
In [38]:
df32.drop_duplicates(['name'])
Out[38]:
9. Null(NaN) 찾기¶
9.1 df.isnull().sum()¶
In [39]:
age_list=[
{'name':'Ace','job':'student','age':20},
{'name':'Bob','job':'student','age':22},
{'name':'Cho','job':'student','age':24},
{'name':'Dab','job':'student','age':26},
{'name':'Emi','job':'student','age':None},
{'name':'Fan','job':'worker','age':None},
{'name':'Geo','job':'worker','age':None},
{'name':'Hye','job':'worker','age':None},
{'name':'Iri','job':'worker','age':30},
{'name':'Joe','job':'worker','age':40}
]
df33=pd.DataFrame(age_list, columns=['name','job','age'])
df33.isnull().sum()
Out[39]:
9.2 df.컬럼.fillna()¶
In [40]:
#unique값 찾기
df33.job.unique()
Out[40]:
In [41]:
#키에 맞는 value건수
df33.job.value_counts()
Out[41]:
In [42]:
#null 을 0으로 채우기
df33.age= df33.age.fillna(0)
df33
Out[42]:
In [43]:
#null을 job그룹의 중앙값으로 처리하기
df34=pd.DataFrame(age_list, columns=['name','job','age'])
df34['age'].fillna(df34.groupby('job')['age'].transform('median'),inplace=True)
df34
Out[43]:
In [44]:
date2 = [
{'yyyy-mm-dd':'2000-06-27'},
{'yyyy-mm-dd':'2003-05-11'},
{'yyyy-mm-dd':'2007-10-27'}
]
df35= pd.DataFrame(date2,columns=['yyyy-mm-dd'])
df35
Out[44]:
In [45]:
def extract_year(row):
return row.split('-')[0]
In [46]:
df35['year']= df35['yyyy-mm-dd'].apply(extract_year)
#df36['year']= df36['yyyy-mm-dd'].map(extract_year) #map 도 같은결과
df35
Out[46]:
In [47]:
def get_age(yyyy,current_yyyy):
return current_yyyy - int(yyyy)
In [48]:
df35['age']= df35['year'].apply(get_age, current_yyyy=2020)
df35
Out[48]:
In [49]:
def extract_d(row,i):
return row.split('-')[i]
In [50]:
df35['month']= df35['yyyy-mm-dd'].apply(extract_d, i=1)
df35['date']= df35['yyyy-mm-dd'].apply(extract_d, i=2)
df35
Out[50]:
In [51]:
def get_intro(age, prefix, suffix):
return prefix+ str(age)+ suffix
In [52]:
df35['intro']= df35['age'].apply(get_intro, prefix='나는 ', suffix='살 입니다')
df35
Out[52]:
10.2 df['col'] = df.apply(function,axis=1)¶
In [53]:
def get_intro2(row):
return "i was born in "+str(row.year)+" my age is "+ str(row.age)
df35['intro2']= df35.apply(get_intro2, axis=1) #row에 있는 모든 컬럼 활용가능
df35
Out[53]:
In [54]:
date2 = [
{'yyyy-mm-dd':'2000-06-27'},
{'yyyy-mm-dd':'2003-05-11'},
{'yyyy-mm-dd':'2007-10-27'}
]
def extract_year(row):
return row.split('-')[0]
df36= pd.DataFrame(date2,columns=['yyyy-mm-dd'])
df36['year']= df36['yyyy-mm-dd'].map(extract_year)
df36
Out[54]:
11.2 df= df['col'].map(dict)¶
In [55]:
list=[
{'name':'Ace','job':'student','age':20},
{'name':'Bob','job':'worker','age':22},
{'name':'Cho','job':'student','age':24}
]
df37=pd.DataFrame(list, columns=['name','job','age'])
df37['job_num'] = df37.job.map({'student':1,'worker':2})
df37
Out[55]:
In [56]:
import numpy as np
list3=[
{'x':1.1,'y':2.2,'z':-2.3},
{'x':-2.1,'y':-5.5,'z':4.5},
{'x':-3.1,'y':1.2,'z':1.9}
]
df38 = pd.DataFrame(list3)
11.3 df= df.applymap()¶
In [57]:
df38= df38.applymap(np.around) #전체 값에 적용
df38
Out[57]:
In [ ]:
반응형
'IT공부 > 파이썬' 카테고리의 다른 글
파이썬 설치 및 시작 (0) | 2020.10.24 |
---|
댓글