Table
DataFrame 結構
1.由多個變數組成ndarray 2.再由index + ndarray組成Series 3.最後由多個Series組成DataFrame
- DataFrame
- 多個 Series
- index + ndarray
- 多個 scalar(變數)
- index + ndarray
- 多個 Series
DataFrame 基本入門操作
1.select:loc,iloc,ix
2.filter:&,|
3.mutate
4.arrange:df.sort_values
5.summarize:df.describe(),df.info()
6.groupby
7.other:df.idxmax()
1.Dataframe創建
#載入數據
1 2 3 4 5 6 |
import pandas as pd df = pd.read_csv('path') df = pd.read_excel('path') df = pd.read_json('path') df.to_excel('1.xlsx') |
#建立 Dataframe
1 2 3 4 5 6 7 8 |
import pandas as pd stock_info= { "name" :['Steven', 'Peter', 'Max'], "price":[ 300, 400, 500] } stock_df = pd.DataFrame(stock_info) print(stock_df) |
dict 和 list 融合
1 |
dict_crawl = {<br /> 'name': [],<br /> 'date': [],<br /> 'description': []<br />}<br /><br /><br />for i in posts:<br /> dict_crawl['name'].append('Max')<br /> dict_crawl['date'].append('2018-11-05')<br /> dict_crawl['description'].append('dict with list') |
2.Select & 缺失值
# Select
1 2 3 4 5 6 |
#label & positional indexing stock_df.ix[:, "name"] stock_df.ix[:, 1] stock_df.loc[:, "name"] #label based indexing stock_df.iloc[:, 1] #positional indexing |
# iloc[:].values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
dataset >>> Country Age Salary Purchased 0 France 44.0 72000.0 No 1 Spain 27.0 48000.0 Yes 2 Germany 30.0 54000.0 No 3 Spain 38.0 61000.0 No 4 Germany 40.0 NaN Yes X = dataset.iloc[ : , :-1].values >>>array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, nan], ['France', 35.0, 58000.0], ['Spain', nan, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object) Y = dataset.iloc[ : , 3].values >>>array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object) |
# 篩選DF的型態
1 2 |
df.select_dtypes(include=['float64']) df.select_dtypes(exclude=['int']) |
# 篩選DF的值
1 |
df0_group = df0_group[df0_group['產品'].str.contains("【會員獨享99元】|(訂單加購)")==False] |
# 缺失值
1 2 3 4 5 6 7 8 9 10 11 |
df['name'].isnull() df['name'].notnull() df.isnull().sum() df.isnull().any() df.dropna() # 捨棄缺失值 df.fillna(0) # 用0填補缺失值 df['age'].fillna(df['age'].mean()) # 用平均值填補缺失值 df['age'].fillna(df.groupby('gender')['age'].transform('mean'),inplace=True) df.interpolate() |
3.Dataframe修改
# datatime
1 |
df['日期']=pandas.to_datetime(df['日期'],format = '%Y年%m月%d日') |
# get_dummies
1 2 |
pandas.get_dummies(df['Sex']) df = pandas.concat([df,pandas.get_dummies(df['Sex'])],axis=1) |
# 跳脫符號
1 |
shaq = 'Shaquille O\'Neal' |
# float
1 2 3 4 5 6 7 8 9 10 11 |
pi = 3.14159265359 print("圓周率是 {}".format(pi)) print("圓周率是 {:.2f}".format(pi)) print("圓周率是 {:.4e}".format(pi)) print("圓周率是 {:.0f}".format(pi)) ... 圓周率是 3.14159265359 圓周率是 3.14 圓周率是 3.1416e+00 圓周率是 3 |
# enumerate(sequence, [start=0])
1 2 |
Name = ['Max', 'Steven', 'Peter','Lucy','Annie','Hugo'] list(enumerate(Name)) |
# list comprehension
1 2 |
istances = [10, 21, 42] ans1 = [x/1.609344 for x in distances] |
# functional programming
1 2 |
istances = [10, 21, 42] ans2 = list(map(lambda x : x / 1.609344, distances)) |
4.Dataframe輸出
#輸出
1 2 3 4 |
pList = [] pList.append([post_name,price]) df.columns = ['Name','Price'] df.to_csv('/Users/max/Desktop/Result.csv', index=False) |
#排序
1 2 3 |
df.sort_values(by = "CS", ascending=False) #重置index .reset_index(drop=True) |
5.其他
#隨機沈睡
1 2 3 |
import time import random time.sleep(random.randrange(3, 10)) |
#運行時間
1 2 3 |
start = time.time() end = time.time() print ("完成時間: %f s" % (end - start)) |
#for if 單行運行
1 |
<span class="pun">[</span><span class="pln">word </span><span class="kwd">for</span><span class="pln"> word </span><span class="kwd">in</span><span class="pln"> lst </span><span class="kwd">if</span><span class="pln"> len</span><span class="pun">(</span><span class="pln">word</span><span class="pun">)</span> <span class="pun">></span> <span class="lit">5</span><span class="pun">]</span> |
#list轉換成str
1 2 3 |
mylist = ['max', 'lucy', 'annie'] print (<strong>', '.join</strong>(mylist)) >>>max, lucy, annie |
#try except語法
1 2 3 4 |
try : #your code except Exception: print ("Game Over") |
#行轉列
1 |
df_select.pivot_table(values="被填入的值", index="設定的index",columns="要轉的行名稱") |
#判別字串是否全部是英文
1 2 3 4 5 |
import re #判斷是否為英文 def all_is_english(word): match = re.search('^[a-zA-Z]+ |
#建立資料夾
1 2 3 4 5 6 7 8 9 10 11 12 |
import os def mkdir(path): folder = os.path.exists(path) if not folder: os.makedirs(path) print ("--- new folder... ---") else: print ("--- There is this folder! ---") file = "/Users/max/Desktop/aa" mkdir(file) |
#merge合併三份df
1 |
pd.merge(pd.merge(df1,df2,on='name'),df3,on='name') |
#append合併多份df
1 2 3 4 |
df_0 = pd.DataFrame({}) for i in ID_unique: df_0 = df_0.append(df_done[df_done['交易ID']==i]) |
# list使用方法
1 2 3 4 5 6 7 8 |
append() 方法 insert() 方法 remove() 方法 pop() 方法 index() 方法 count() 方法 sort() 方法 reverse() 方法 #反轉 |
# dictionary使用方法
1 2 3 4 |
get() 方法 keys() 方法 items() 方法 values() 方法 |
# 將小數點轉換成百分數
1 |
format('1.22','.0%') |
# 將 df 的小數點轉換成百分數
1 |
df['uniquePurchases'].apply(lambda x: format(x, '.2%')) |
延伸閱讀:
那 [Python教學] 寫給Python初學者的入門操作 就到這邊,感謝收看,有關Max行銷誌的最新文章,都會發佈在Max的Facebook粉絲專頁,如果想看最新更新,還請您按讚或是追蹤唷!