01
Try using numpy for operations whenever possible
Use to_numpy()
instead of .values
Importing¶
Creating Dataframe¶
Dataset¶
Manual¶
perf = pd.DataFrame(
columns = ['Season', 'Appearances', 'Goals', 'Assists'],
data = [
["2021/2022", 39, 24, 3],
["2020/2021", 44, 36, 4],
["2019/2020", 46, 37, 7]
]
)
Filtering¶
query
and eval
¶
## query - better than boolean masking
df = df.query("""
@date_start <= Date <= @date_end and \
Type in @event_type
""")
mask = df.eval("something") ## gives the boolean mask corresponding to this
Boolean Masking¶
Editing Values¶
(
prediction
.assign(
Rating = prediction.Rating.to_numpy() * 100,
Value = prediction.Value.to_numpy() * 50,
)
)
prediction[['Rating', 'Value']] = prediction[['Rating', 'Value']].to_numpy() * 100
Read/Write¶
Single File¶
df = pd.read_csv(
file,
engine="pyarrow", backend_dtypes="pyarrow"
)
dfs = pd.read_excel('GDSC.xlsx', sheet_name="Something") ## gives all
dfs = pd.read_excel('GDSC.xlsx', sheet_name=None) ## gives all
for table, df in dfs.items():
print(table, df)
Multiple Files¶
raw_formula_student = pd.DataFrame()
for file in files:
if( ".csv" == file[-4:] ):
raw_formula_student = pd.concat(
[raw_formula_student, read_file(file)]
)
Read Google Sheets¶
Best method
import gspread
gc = gspread.service_account("key.json")
from gspread_dataframe import get_as_dataframe as get_gsheet, set_with_dataframe as set_gsheet
from functools import lru_cache
def gsheet_to_csv(spreadsheet_id, sheet_id=None, sheet_name=None):
## make sure the spreadsheet is publicly viewable
if sheet_id is not None:
link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&gid={sheet_id}"
elif sheet_name is not None:
link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
else:
return None
df = pd.read_csv(
link,
engine = "pyarrow",
backend_dtypes = "pyarrow"
)
return df
def gsheet_by_api(spreadsheet_id, sheet_id=None, sheet_name=None):
gsheet = gc.open_by_key(spreadsheet_id)
if sheet_id is not None:
sheet = gsheet.get_worksheet_by_id(sheet_id)
elif sheet_name is not None:
sheet = getattr(gsheet, sheet_name)
else:
return None
df = get_gsheet(sheet, evaluate_formulas=True)
return df
@lru_cache(maxsize = 128) ## or st.cache_data(ttl=ttl_long)
def read_gsheet(spreadsheet_id, sheet_id=None, sheet_name=None, parse_dates=None, csv=True):
if csv is True:
df = gsheet_to_csv(spreadsheet_id, sheet_id, sheet_name)
else:
df = gsheet_by_api(spreadsheet_id, sheet_id, sheet_name)
df = df.dropna(how= "all", axis="index")
df = df.dropna(how= "all", axis="columns")
for col_name in df.columns:
if "date" in col_name.lower() or "time".lower() in col_name:
df[col_name] = pd.to_datetime(df[col_name])
return df
Old method (Not working)
gsheetkey = "1kax9m1FKah7cWPwylxhdJSyqF5eVALjgRbxyPuPg7g0"
sheet_name = 'Social_Media_Analysis'
url= f'[لم يتم العثور على الصفحة](https://docs.google.com/spreadsheet/ccc?key={gsheetkey}&output=xlsx')
sheet = pd.read_excel(url, sheet_name = sheet_name)
Useful Functions¶
df.head()
df.tail()
df.describe()
df.dropna()
df.select_dtypes(int)
df.memory_usage(deep=True).sum()/1_000_000 ## in MBs
Function | Function Application |
---|---|
pipe() | Table-wise |
assign() | |
apply() | Row/column-wise (not good to use) |
applymap() | Element-wise (not good to use) |
df.iloc[:, 2:11] = (
df.iloc[:, 2:11]
.apply(lambda x: x.str.replace(',', '.'))
.to_numpy()
.astype(float)
)
View null values¶
Correlation¶
Correlation Matrix¶
Correlation Ranking¶
(
formula_student
.iloc[:, 1:11]
.corr("Overall Scores")
.sort_values("Correlation", ascending=False)
.iloc[1:, :] ## remove the obvious overall scores = 1.00
)
## this is unnecessarily complicated way i used before
(
formula_student
.iloc[:, 1:11]
.corr()
.rename(columns={"Overall Scores":"Correlation"})
[["Correlation"]]
.sort_values("Correlation", ascending=False)
.iloc[1:, :] ## remove the obvious overall scores = 1.00
)
Deleting¶
Drop first \(n\) records
Sorting¶
(
perf
.sort_values([
"Season",
"Rating",
"Value"
], ascending=[
True,
True,
False
])
.reset_index(drop=True)
)
Different approach
(
formula_student[["Cost", "Overall Placing"]]
.sort_values("Cost", ascending = False)
.head(10)
.sort_values("Overall Placing")
)
Grouping/Aggregation¶
Single Aggregation Function¶
All Columns¶
(
merged
.groupby(
["Year"],
as_index=False ## if Year should not become index of dataframe
)
.mean()
.rename(columns={
"Value": "MeanValue",
"Overall": "MeanOverall",
"CPIValue": "MeanCPIValue"
})
)
## or mean = merged.groupby(["Year"]).mean().reset_index()
Renaming is for obvious reasons
Particular Columns¶
Multiple Aggregation Function¶
summary_df = (
df[["latitude", "longitude", "emission"]]
.groupby(["latitude", "longitude"], as_index=False)
.agg({
"emission": ["median","std", "mean", "min", "max"]
})
)
summary_df.columns = summary_df.columns.map('_'.join)
df = df.merge(summary_df, how="inner")
Round-off all results
formula_student[["Car", "Overall Scores"]]
.groupby(
["Competition"]
)
.agg({
'Car' : ["count"],
'Overall Scores' : ["mean", "min", "max"]
})
.round(1)
Round-off specific results
def mean_func(x):
return round(x.mean(), 1)
formula_student[["Car", "Overall Scores"]]
.groupby(
["Competition"]
)
.agg({
'Car' : ["count"],
'Overall Scores' : [mean_func, "min", "max"]
})
IDK¶
The count of each value until it changes to another value
index value
0 10
1 10
2 23
3 23
4 9
5 9
6 9
7 10
8 10
9 10
10 10
11 12
---
index count
10 2
23 2
9 3
10 4
12 1
col = 'col_name'
df = (
df
.groupby(
df[col]
.ne( ## not equal to previous value; ie change occured
df[col]
.shift()
)
.cumsum()
)
[col]
.value_counts()
.reset_index(level=0, drop=True)
)
Explanation
## This is the intermediate dataframe produced
## We then group by cumsum
index value shifted not_equal cumsum
0 10 NaN True 1
1 10 10.0 False 1
2 23 10.0 True 2
3 23 23.0 False 2
4 9 23.0 True 3
5 9 9.0 False 3
6 9 9.0 False 3
7 10 9.0 True 4
8 10 10.0 False 4
9 10 10.0 False 4
10 10 10.0 False 4
11 12 10.0 True 5
col = 'col_name'
changes = (
df[col]
.diff()
.ne(0)
.cumsum()
)
df = (
df
.groupby([changes,col])
.size()
.reset_index(level=0, drop=True)
)
Join/Merge¶
merged = pd.merge(
ratings[ratings["Value"] >= 10000],
cpi
).sort_values("Value").reset_index(drop=True)
Plotting¶
Plotting Backend¶
Type Casting¶
Lagged Value¶
alias::shift lag
Performance Optimization¶
https://pythonspeed.com/datascience
Selectively-load columns¶
Selectively-load rows¶
Reducing variables in functions¶
instead of
def process_data():
data = load_1GB_of_data() ## ← `data` var lives too long
return modify2(modify1(data))
Use correct dtypes
¶
df = pd.read_csv(
"dataset.csv",
dtype = {
"Age": "uint8",
"Year": "uint16",
"Time": "datetime"
"Salary": "ufloat32",
"Gender": "category",
"Name": "string[pyarrow]"
}
)
Dynamically cleaning up after reading based on datatype; but i would recommend above
def get_optimal_numeric_type(c_min: float, c_max: float, col_type: str) -> str:
"""
Determines the optimal numeric data type for a given range of values.
Parameters
----------
c_min : float
The minimum value of the data.
c_max : float
The maximum value of the data.
col_type : str
The current data type of the column ('int' or 'float').
Returns
-------
optimal_type : str
The optimal data type for the given range of values.
"""
type_info = np.iinfo if col_type == 'int' else np.finfo
for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
if col_type in str(dtype):
if c_min > type_info(dtype).min and c_max < type_info(dtype).max:
return dtype
return None
""" Based on the data type and the range of values, the function determines the smallest possible data type that can accommodate the data without losing information. For example, if the data type is an integer and the range of values fits within the bounds of an int8 data type, the function converts the column data type to int8: """
def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
"""
Reduces memory usage of a pandas DataFrame by converting its columns to the most memory-efficient data types
without losing information.
Parameters
----------
df : pd.DataFrame
The input pandas DataFrame that needs memory optimization.
Returns
-------
df : pd.DataFrame
The optimized pandas DataFrame with reduced memory usage.
"""
## Iterate through each column in the DataFrame
df_copy = df.copy()
for col in df_copy.columns:
col_type = df_copy[col].dtype
## Check if the data type is not an object (i.e., numeric type)
if col_type != object:
c_min, c_max = df_copy[col].min(), df_copy[col].max()
col_type_str = 'int' if 'int' in str(col_type) else 'float'
optimal_type = get_optimal_numeric_type(c_min, c_max, col_type_str)
if optimal_type:
df_copy[col] = df_copy[col].astype(optimal_type)
## If the data type is an object, convert the column to a 'category' data type
else:
df_copy[col] = df_copy[col].astype('category')
## Return the optimized DataFrame with reduced memory usage
return df_copy
Use [[pyarrow]]
engine¶
Use Modin¶
Clipping¶
## using pandas
df = df.clip(
lower = df['Column'].quantile(.25, interpolation="midpoint"),
upper = df['Column'].quantile(.75, interpolation="midpoint")
)
Chaining¶
SQL¶
Exporting to HTML¶
html = """
<head>
<link rel="stylesheet" href="../../../backend/plugins/bootstrap/bootstrap.min.css">
<script src="../../../backend/plugins/bootstrap/bootstrap.min.js"></script>
</head>
<body>
"""
<body>
html += blah_df.head.to_html(classes='table table-stripped')
html += "</body>"
exporter = open("blah_df.html", 'w')
exporter.write(html)
exporter.close()
Get¶
Get last letter of every row¶
First Name¶
Year/Month/Date¶
Time Series¶
df.shift(10)
df.diff(10)
## window functions
df.rolling(10).mean()
df.rolling(10, center=True).mean()
df.expanding(10).mean()
df.expanding(10, center=True).mean()
## exponential weighted window
df.ewm(10).mean()
df.ewm(10, center=True).mean()
Faster windows using numpy¶
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
np.mean(rolling_window(s, 2), axis=1)
np.median(rolling_window(s, 2), axis=1)
Missing Values¶
def check(data):
print(data.isnull().values.any())
sheet.apply(check) ## check if col has missing value
sheet.apply(check, axis=1) ## check if row has missing value
sheet.pipe(check) ## check if dataframe has missing value
My Cleaning¶
def clean_df(data):
df = data.copy()
for column in df.columns:
if "date" in column:
df[column] = pd.to_datetime(sheet[column])
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
return df
sheet.pipe(clean_df)