文章目錄
DataFrame 結構
1.由多個變數組成ndarray 2.再由index + ndarray組成Series 3.最後由多個Series組成DataFrame
- DataFrame
- 多個 Series
- index + ndarray
- 多個 scalar(變數)
- index + ndarray
- 多個 Series
DataFrame 基本入門操作
1.select:loc,iloc,ix
2.filter:&,|
3.mutate
4.arrange:df.sort_values
5.summarize:df.describe(),df.info()
6.groupby
7.other:df.idxmax()
1.Dataframe創建
#載入數據
import pandas as pd df = pd.read_csv('path') df = pd.read_excel('path') df = pd.read_json('path') df.to_excel('1.xlsx')
#建立 Dataframe
import pandas as pd stock_info= { "name" :['Steven', 'Peter', 'Max'], "price":[ 300, 400, 500] } stock_df = pd.DataFrame(stock_info) print(stock_df)
dict 和 list 融合
dict_crawl = {
'name': [],
'date': [],
'description': []
}
for i in posts:
dict_crawl['name'].append('Max')
dict_crawl['date'].append('2018-11-05')
dict_crawl['description'].append('dict with list')
2.Select & 缺失值
# Select
#label & positional indexing stock_df.ix[:, "name"] stock_df.ix[:, 1] stock_df.loc[:, "name"] #label based indexing stock_df.iloc[:, 1] #positional indexing
# iloc[:].values
dataset >>> Country Age Salary Purchased 0 France 44.0 72000.0 No 1 Spain 27.0 48000.0 Yes 2 Germany 30.0 54000.0 No 3 Spain 38.0 61000.0 No 4 Germany 40.0 NaN Yes X = dataset.iloc[ : , :-1].values >>>array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, nan], ['France', 35.0, 58000.0], ['Spain', nan, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object) Y = dataset.iloc[ : , 3].values >>>array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)
# 篩選DF的型態
df.select_dtypes(include=['float64']) df.select_dtypes(exclude=['int'])
# 篩選DF的值
df0_group = df0_group[df0_group['產品'].str.contains("【會員獨享99元】|(訂單加購)")==False]
# 缺失值
df['name'].isnull() df['name'].notnull() df.isnull().sum() df.isnull().any() df.dropna() # 捨棄缺失值 df.fillna(0) # 用0填補缺失值 df['age'].fillna(df['age'].mean()) # 用平均值填補缺失值 df['age'].fillna(df.groupby('gender')['age'].transform('mean'),inplace=True) df.interpolate()
3.Dataframe修改
# datatime
df['日期']=pandas.to_datetime(df['日期'],format = '%Y年%m月%d日')
# get_dummies
pandas.get_dummies(df['Sex']) df = pandas.concat([df,pandas.get_dummies(df['Sex'])],axis=1)
# 跳脫符號
shaq = 'Shaquille O\'Neal'
# float
pi = 3.14159265359 print("圓周率是 {}".format(pi)) print("圓周率是 {:.2f}".format(pi)) print("圓周率是 {:.4e}".format(pi)) print("圓周率是 {:.0f}".format(pi)) ... 圓周率是 3.14159265359 圓周率是 3.14 圓周率是 3.1416e+00 圓周率是 3
# enumerate(sequence, [start=0])
Name = ['Max', 'Steven', 'Peter','Lucy','Annie','Hugo'] list(enumerate(Name))
# list comprehension
istances = [10, 21, 42] ans1 = [x/1.609344 for x in distances]
# functional programming
istances = [10, 21, 42] ans2 = list(map(lambda x : x / 1.609344, distances))
4.Dataframe輸出
#輸出
pList = [] pList.append([post_name,price]) df.columns = ['Name','Price'] df.to_csv('/Users/max/Desktop/Result.csv', index=False)
#排序
df.sort_values(by = "CS", ascending=False) #重置index .reset_index(drop=True)
5.其他
#隨機沈睡
import time import random time.sleep(random.randrange(3, 10))
#運行時間
start = time.time() end = time.time() print ("完成時間: %f s" % (end - start))
#for if 單行運行
[word for word in lst if len(word) > 5]
#list轉換成str
mylist = ['max', 'lucy', 'annie'] print (', '.join(mylist)) >>>max, lucy, annie
#try except語法
try : #your code except Exception: print ("Game Over")
#行轉列
df_select.pivot_table(values="被填入的值", index="設定的index",columns="要轉的行名稱")
#判別字串是否全部是英文
import re #判斷是否為英文 def all_is_english(word): match = re.search('^[a-zA-Z]+
#建立資料夾
import os def mkdir(path): folder = os.path.exists(path) if not folder: os.makedirs(path) print ("--- new folder... ---") else: print ("--- There is this folder! ---") file = "/Users/max/Desktop/aa" mkdir(file)
#merge合併三份df
pd.merge(pd.merge(df1,df2,on='name'),df3,on='name')
#append合併多份df
df_0 = pd.DataFrame({}) for i in ID_unique: df_0 = df_0.append(df_done[df_done['交易ID']==i])
# list使用方法
append() 方法 insert() 方法 remove() 方法 pop() 方法 index() 方法 count() 方法 sort() 方法 reverse() 方法 #反轉
# dictionary使用方法
get() 方法 keys() 方法 items() 方法 values() 方法
# 將小數點轉換成百分數
format('1.22','.0%')
# 將 df 的小數點轉換成百分數
df['uniquePurchases'].apply(lambda x: format(x, '.2%'))
延伸閱讀:
那 [Python教學] 寫給Python初學者的入門操作 就到這邊,感謝收看,有關Max行銷誌的最新文章,都會發佈在Max的Facebook粉絲專頁,如果想看最新更新,還請您按讚或是追蹤唷!