development:python:pandas
This is an old revision of the document!
Table of Contents
1. Pandas
https://pandas.pydata.org/docs
Import thư viện:
import pandas as pd import numpy as np pd.options.mode.copy_on_write = True # https://pandas.pydata.org/docs/user_guide/copy_on_write.html
2. Create DataFrame
2.1 Create from a CSV
df = pd.read_csv('data/pandas/file.csv') df = pd.read_csv('data/pandas/file.csv', engine="pyarrow") df = pd.read_csv('data/pandas/file.csv', low_memory=False)
2.2 Create from a Dictionary
tempdict = {'col1':[1,2,3], 'col2':[4,5,6], 'col3':[7,8,9]} df = pd.DataFrame.from_dict(tempdict)
2.3 Create from a DataFrame
df_new = df[["eid", "type", "request_amount", "user_name", "status", "partner_status", "approved_time", "created_at", "updated_at", "date"]]
2.4 Append from DataFrames
df_ut_01 = pd.read_csv('../data/ns/ut_2021_01.csv', engine="pyarrow") df_ut_02 = pd.read_csv('../data/ns/ut_2021_12.csv', engine="pyarrow") df_ut = df_ut_01._append(df_ut_02)
3. Read DataFrame
3.1 Show Top n and Bottom n Rows
df.head(10) df.head() df.tail(15)
3.2 Show Columns and Data Type
df.columns df.dtypes
3.3 Summary Statistics
df.describe() df.describe(include='object')
3.4 Filtering Columns
df.COLUMN_NAME df['COLUMN_NAME'] df[['COLUMN_NAME_1', 'COLUMN_NAME_2']]
3.5 Filtering on Rows
df[df['COLUMN_NAME'] == 'Column Value'] df[(df['COLUMN_NAME_1']=='Column Value') & (df['COLUMN_NAME_2']==True)]
Ví dụ 1:
approve_from_time = np.datetime64('2021-01-01 00:00:00') approve_to_time = np.datetime64('2021-01-30 23:59:59') df_ut_data = df_ut[(df_ut['approved_time'] >= approve_from_time) & (df_ut['approved_time'] <= approve_to_time)] df_ut_data_success = df_ut[(df_ut['partner_status'] == 'SUCCESS') & (df_ut['approved_time'] >= approve_from_time) & (df_payout['approved_time'] <= approve_to_time)]
Ví dụ 2:
remit_accounts = ['TerraPay', 'tranglo', 'Nium', 'GME', 'OnePIP', 'Ipay', 'SingX', 'brdgx', 'hanshan', 'HanshanAPI', 'tranglo_CITAD', 'Nium_CITAD', 'SingX_CITAD'] df_payout_remit = df_ut_2024_new[ (df_ut_2024_new['type'] == 'TRANSFER_BANK') & (df_ut_2024_new["user_name"].isin(remit_accounts)) ]
4. Update DataFrame
4.1 Creating Column
df['NEW_COLUMN'] = df['NEW_COLUMN_1'] + df['NEW_COLUMN_2']
df_ut['date'] = pd.to_datetime(df_ut['date'], format='%d/%m/%Y') df_ut['approved_time'] = pd.to_datetime(df_ut['approved_time'], format='%d/%m/%Y %H:%M:%S') df_ut['created_at'] = pd.to_datetime(df_ut['created_at'], format='%d/%m/%Y %H:%M:%S') df_ut['updated_at'] = pd.to_datetime(df_ut['updated_at'], format='%d/%m/%Y %H:%M:%S') df_ut['processed_time'] = (df_ut['approved_time'] - df_ut['created_at']).dt.seconds df_ut['weekday'] = df_ut['created_at'].dt.day_name() df_ut['fee_postpaid'] = np.where(((df_ut['fee_type'] == 'POSTPAID' )) , df_ut['fee'], 0)
4.2. Update a DataFrame
df['COLUMN_NAME'] = 'Column Value'
Ví dụ 1:
def get_fee(entity): year = int(entity['created_at'].year) if year == 2024: return get_fee_2024(entity) return 0 def get_fee_2024(entity): month = int(entity['created_at'].month) day = entity['created_at'].day provider = entity['provider'] request_amount = entity['request_amount'] bank_no = entity['bank_no'] if provider == 'EPAY': return 2200 if provider == 'TECHCOMBANK' and bank_no != provider: return 275 if month >= 3 else 550 if provider == 'MBBANK': return 880 if bank_no != provider else 1980 if provider == 'VIETINBANK' and bank_no != provider: return 1650 if provider == 'VPBANK' and bank_no != provider: return 275 if month <= 2 else 605 if provider == 'BIDV': if bank_no != provider: return 1100 return 1705 if request_amount <= 300000000 else 1925 return 0 df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y') df['created_at'] = pd.to_datetime(df['created_at'], format='%d/%m/%Y %H:%M:%S') df['fee'] = df.apply(lambda x: get_fee(x), axis=1) df['processed_time'] = (df['approved_time'] - df['created_at']).dt.seconds df.loc[:,'day'] = df.created_at.dt.strftime('%d')
Ví dụ 2:
df_industries = pd.read_csv('../data/ns/user_industries.csv', low_memory=False) df_industries.rename(columns={ 'Tài khoản': 'name', 'Tên tài khoản': 'fullname', 'Trạng thái': 'status', 'Thời gian tạo': 'created_at', 'Ngành nghề': 'industry_name' }, inplace=True) df_industries = df_industries.drop(columns=['STT']) df_industries.insert(3, "Note", "") df_industries.set_index('name') df_industries
Ví dụ 3:
import math def get_fee_total(d): fee_total = 0 if math.isnan(d['2021_payin_fee_total']) == False: fee_total = fee_total + float(d['2021_payin_fee_total']) if math.isnan(d['2022_payin_fee_total']) == False: fee_total = fee_total + float(d['2022_payin_fee_total']) if math.isnan(d['2023_payin_fee_total']) == False: fee_total = fee_total + float(d['2023_payin_fee_total']) if math.isnan(d['2024_payin_fee_total']) == False: fee_total = fee_total + float(d['2024_payin_fee_total']) if math.isnan(d['2021_payout_fee_total']) == False: fee_total = fee_total + float(d['2021_payout_fee_total']) if math.isnan(d['2022_payout_fee_total']) == False: fee_total = fee_total + float(d['2022_payout_fee_total']) if math.isnan(d['2023_payout_fee_total']) == False: fee_total = fee_total + float(d['2023_payout_fee_total']) if math.isnan(d['2024_payout_fee_total']) == False: fee_total = fee_total + float(d['2024_payout_fee_total']) return fee_total df_fee['fee_total'] = df_fee.apply(lambda x: get_fee_total(x), axis=1)
4.3. Delete a DataFrame
del df
4.4. Convert nan to 0
df_statistic['count'] = df_statistic['count'].fillna(0)
4.5. Convert column type
df_statistic['count'] = df_statistic['count'].astype('Int64')
5. Group/Sort Data
df.groupby(['user_name'])['request_amount'].sum() df_groups = df.groupby(['transaction_id']).agg({'transaction_id':'size', 'fee': 'sum', 'amount': 'mean'}).rename(columns={'transaction_id':'count', 'fee': 'fee_total', 'amount': 'amount_avg'}).reset_index() df_groups.sort_values(by='count', ascending=False) df_groups.sort_values(by='avg', ascending=False) # Tính giá trị trung bình 1 cột df["internal_time"].mean()
6. Merge 2 Dataframe
df_statistic = pd.merge(df_users, df_industries, left_on='name', right_on='name', how = 'left')
7. Output
df.to_csv('data/pandas/output.csv') df.to_json() df.to_html() with pd.ExcelWriter('../data/ns/dd/file.xlsx') as writer: df_1.to_excel(writer, sheet_name='sheet 1') df_2_excel(writer, sheet_name='sheet 2')
development/python/pandas.1731050705.txt.gz · Last modified: 2024/11/08 07:25 by tungnt