
Try using numpy for operations whenever possible

Use to_numpy() instead of .values
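
For example, a quick sketch (column name made up) of doing the numeric work on the NumPy array instead of the Series:

  import numpy as np
  import pandas as pd

  df = pd.DataFrame({"Goals": [24, 36, 37]})

  goals = df["Goals"].to_numpy()   ## preferred over df["Goals"].values
  df["LogGoals"] = np.log1p(goals) ## vectorized numpy operation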

Importing

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 200)

Creating DataFrame

Dataset

  df = pd.read_csv("dataset.csv")

Manual

  perf = pd.DataFrame(
      columns = ['Season', 'Appearances', 'Goals', 'Assists'],
      data = [
          ["2021/2022", 39, 24, 3],
          ["2020/2021", 44, 36, 4],
          ["2019/2020", 46, 37, 7]
      ]
  )

Filtering

query and eval

  ## query - often cleaner than boolean masking
  ## @name refers to a Python variable in the local scope
  df = df.query("""
  @date_start <= Date <= @date_end and \
  Type in @event_type
  """)

  mask = df.eval("something") ## gives the boolean mask for the expression, without filtering

Boolean Masking

  df = df[
    (df["Date"] >= date_start) &
    (df["Date"] <= date_end) &
    (df["Type"].isin(event_type))
  ]

Editing Values

(
  prediction
  .assign(
    Rating = prediction.Rating.to_numpy() * 100,
    Value = prediction.Value.to_numpy() * 50,
  )
)

prediction[['Rating', 'Value']] = prediction[['Rating', 'Value']].to_numpy() * 100

Read/Write

Single File

  df = pd.read_csv(
    file,
    engine="pyarrow", dtype_backend="pyarrow"
  )

  df = pd.read_excel('GDSC.xlsx', sheet_name="Something") ## gives a single sheet
  dfs = pd.read_excel('GDSC.xlsx', sheet_name=None) ## gives all sheets as a dict
  for table, df in dfs.items():
      print(table, df)
  import os

  file_name = file[:-4] + ".csv"
  df.to_csv(
    os.path.join(os.getcwd(), rel, file_name), ## rel: relative output folder
    index = False
  )

Multiple Files

  ## collect the frames first and concat once (cheaper than concat inside the loop)
  dfs = [
      read_file(file)
      for file in files
      if file.endswith(".csv")
  ]
  raw_formula_student = pd.concat(dfs)

Read Google Sheets

Best method

    import gspread
    gc = gspread.service_account("key.json")

    from gspread_dataframe import get_as_dataframe as get_gsheet, set_with_dataframe as set_gsheet

    from functools import lru_cache

    def gsheet_to_csv(spreadsheet_id, sheet_id=None, sheet_name=None):
        ## make sure the spreadsheet is publicly viewable
        if sheet_id is not None:
            link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&gid={sheet_id}"
        elif sheet_name is not None:
            link = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
        else:
            return None

        df = pd.read_csv(
            link,
            engine = "pyarrow",
            dtype_backend = "pyarrow"
        )
        return df

    def gsheet_by_api(spreadsheet_id, sheet_id=None, sheet_name=None):
        gsheet = gc.open_by_key(spreadsheet_id)
        if sheet_id is not None:
            sheet = gsheet.get_worksheet_by_id(sheet_id)
        elif sheet_name is not None:
            sheet = gsheet.worksheet(sheet_name)
        else:
            return None

        df = get_gsheet(sheet, evaluate_formulas=True)
        return df

    @lru_cache(maxsize = 128) ## or st.cache_data(ttl=ttl_long)
    def read_gsheet(spreadsheet_id, sheet_id=None, sheet_name=None, parse_dates=None, csv=True):
        if csv is True:
            df = gsheet_to_csv(spreadsheet_id, sheet_id, sheet_name)
        else:
            df = gsheet_by_api(spreadsheet_id, sheet_id, sheet_name)

        df = df.dropna(how="all", axis="index")
        df = df.dropna(how="all", axis="columns")

        ## parse explicitly requested date columns, else auto-detect by name
        date_cols = parse_dates if parse_dates is not None else [
            col_name for col_name in df.columns
            if "date" in col_name.lower() or "time" in col_name.lower()
        ]
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name])

        return df

Old method (Not working)

    gsheetkey = "1kax9m1FKah7cWPwylxhdJSyqF5eVALjgRbxyPuPg7g0"
    sheet_name = 'Social_Media_Analysis'
    url = f'https://docs.google.com/spreadsheet/ccc?key={gsheetkey}&output=xlsx'

    sheet = pd.read_excel(url, sheet_name = sheet_name)

Useful Functions

df.head()
df.tail()

df.describe()
df.dropna()

df.select_dtypes(int)

df.memory_usage(deep=True).sum()/1_000_000 ## in MBs 
df = df.set_index([
    "Season",
    "Player Name"
])
df['col'].unique()
df.value_counts()
100 * df['col'].value_counts() / df['col'].shape[0]
df['Age'].mean()
| Function   | Function Application               |
|------------|------------------------------------|
| pipe()     | Table-wise                         |
| assign()   | Column-wise (add/modify columns)   |
| apply()    | Row/column-wise (not good to use)  |
| applymap() | Element-wise (not good to use)     |
## convert decimal commas ('1,5') to '.' and cast to float
df.iloc[:, 2:11] = (
  df.iloc[:, 2:11]
  .apply(lambda x: x.str.replace(',', '.'))
  .to_numpy()
  .astype(float)
)
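
A sketch of an alternative that avoids apply() (per the table above), assuming those columns are strings that use ',' as the decimal separator:

  df.iloc[:, 2:11] = (
    df.iloc[:, 2:11]
    .replace(',', '.', regex=True) ## vectorized string replacement
    .astype(float)
  )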

View null values

  formula_student[
      formula_student.isna().any(axis=1)
  ]

Correlation

Correlation Matrix

  (
    formula_student
    [['Cost', 'Design', 'Overall Scores']]
    .corr()
  )

Correlation Ranking

  (
      formula_student
      .iloc[:, 1:11]
      .corrwith(formula_student["Overall Scores"])
      .rename("Correlation")
      .sort_values(ascending=False)
      .iloc[1:] ## remove the obvious overall scores = 1.00
  )
  ## this is the unnecessarily complicated way I used before
  (
      formula_student
      .iloc[:, 1:11]
      .corr()
      .rename(columns={"Overall Scores":"Correlation"})
      [["Correlation"]]
      .sort_values("Correlation", ascending=False)
      .iloc[1:, :] ## remove the obvious overall scores = 1.00
  )

Deleting

Drop first \(n\) records

  df = df.iloc[n:, :]

Sorting

(
  perf
  .sort_values("Season")
  .reset_index(drop=True)
)
(
  perf
  .sort_values([
    "Season",
    "Rating",
    "Value"
  ], ascending=[
    True,
    True,
    False
  ])
  .reset_index(drop=True)
)

Different approach

  (
    formula_student[["Cost", "Overall Placing"]]
    .sort_values("Cost", ascending = False)
    .head(10)
    .sort_values("Overall Placing")
  )

Grouping/Aggregation

meanCols = {
    "Value": "MeanValue",
    "Overall": "MeanOverall",
    "CPIValue": "MeanCPIValue"
}

Single Aggregation Function

df.to_numpy().mean()
## don't do df.mean().mean()

np.nanmean(df.to_numpy())
np.nanstd(df.to_numpy())
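
Why not df.mean().mean()? It averages the per-column means, so every column gets equal weight regardless of how many non-null values it holds; the true global mean can differ. A small sketch with made-up values:

  import numpy as np
  import pandas as pd

  df = pd.DataFrame({
      "a": [1.0, 1.0, 1.0, 1.0],           ## 4 values, mean 1.0
      "b": [9.0, np.nan, np.nan, np.nan],  ## 1 value, mean 9.0
  })

  print(df.mean().mean())           ## (1.0 + 9.0) / 2 = 5.0
  print(np.nanmean(df.to_numpy()))  ## 13 / 5 = 2.6, the actual global mean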

All Columns

    (
      merged
      .groupby(
        ["Year"],
        as_index=False ## if Year should not become index of dataframe
      )
      .mean()
      .rename(columns={
        "Value": "MeanValue",
        "Overall": "MeanOverall",
        "CPIValue": "MeanCPIValue"
      })
    )
    ## or mean = merged.groupby(["Year"]).mean().reset_index()

The rename just gives the aggregated columns clearer names.

Particular Columns

    mean = merged[["Year","Value"]].groupby(
      ["Year"],
      as_index=False
    ).mean().rename(columns=meanCols)

Multiple Aggregation Functions

  summary_df = (
      df[["latitude", "longitude", "emission"]]
      .groupby(["latitude", "longitude"], as_index=False)
      .agg({
          "emission": ["median","std", "mean", "min", "max"]
      })
  )
  summary_df.columns = summary_df.columns.map('_'.join).str.rstrip('_') ## flatten MultiIndex columns; strip the trailing '_' left on the group keys

  df = df.merge(summary_df, how="inner")
  (
    merged
    .groupby(
      ["Year"],
      as_index=False
    )
    .agg(
      ['mean', 'sum']
    )
  )

Round-off all results

    formula_student[["Car", "Overall Scores"]]
        .groupby(
            ["Competition"]
        )
        .agg({
            'Car' : ["count"],
            'Overall Scores' : ["mean", "min", "max"]
        })
        .round(1)

Round-off specific results

    def mean_func(x):
        return round(x.mean(), 1)

    formula_student[["Car", "Overall Scores"]]
        .groupby(
            ["Competition"]
        )
        .agg({
            'Car' : ["count"],
            'Overall Scores' : [mean_func, "min", "max"]
        })

Consecutive Value Counts

The count of each value until it changes to another value

  index  value
      0     10
      1     10
      2     23
      3     23
      4      9
      5      9
      6      9
      7     10
      8     10
      9     10
     10     10
     11     12

  ---

  value count
     10     2
     23     2
      9     3
     10     4
     12     1

  col = 'col_name'

  df = (
    df
    .groupby(
      df[col]
      .ne( ## not equal to previous value, i.e. a change occurred
        df[col]
        .shift()
      )
      .cumsum()
    )
    [col]
    .value_counts()
    .reset_index(level=0, drop=True)
  )

Explanation

      ## This is the intermediate dataframe produced
      ## We then group by cumsum

      index  value  shifted  not_equal  cumsum
          0     10      NaN       True       1
          1     10     10.0      False       1
          2     23     10.0       True       2
          3     23     23.0      False       2
          4      9     23.0       True       3
          5      9      9.0      False       3
          6      9      9.0      False       3
          7     10      9.0       True       4
          8     10     10.0      False       4
          9     10     10.0      False       4
         10     10     10.0      False       4
         11     12     10.0       True       5

  ## alternative: diff-based approach (numeric columns only)
  col = 'col_name'

  changes = (
    df[col]
    .diff()
    .ne(0)
    .cumsum()
  )

  df = (
    df
    .groupby([changes,col])
    .size()
    .reset_index(level=0, drop=True)
  )

Join/Merge

merged = pd.merge(
    ratings[ratings["Value"] >= 10000],
    cpi
).sort_values("Value").reset_index(drop=True)

Plotting

merged.plot(
  x="Season",
  ylabel="Feature",
  title="Features over Time"
)

Plotting Backend

  pd.options.plotting.backend = 'plotly'
  df.plot(backend='plotly')

Type Casting

df['Fee'] = df['Fee'].to_numpy().astype(float)
## df['Fee'] = df['Fee'].astype(float)

Lagged Value

alias::shift lag

nba["wpc_lag"] = (
  nba
  .groupby("Team")
  ["wpc"]
  .shift(1)
)

Performance Optimization

https://pythonspeed.com/datascience

Selectively-load columns

  df = pd.read_csv(
    "dataset.csv",
    usecols = [
      "First Name",
      "Last Name"
    ]
  )

Selectively-load rows

  df = pd.read_csv(
    "dataset.csv",
    nrows = 10
  )

Reducing variables in functions

  def process_data():
      return modify2(modify1(load_1GB_of_data()))

instead of

  def process_data():
      data = load_1GB_of_data() ## ← `data` var lives too long
      return modify2(modify1(data))

Use correct dtypes

  df = pd.read_csv(
    "dataset.csv",
    dtype = {
      "Age": "uint8",
      "Year": "uint16",
      "Salary": "float32",
      "Gender": "category",
      "Name": "string[pyarrow]"
    },
    parse_dates = ["Time"] ## datetime columns go in parse_dates, not dtype
  )

Dynamically downcasting after reading, based on each column's datatype; I would still recommend specifying dtypes upfront as above.

    def get_optimal_numeric_type(c_min: float, c_max: float, col_type: str) -> str:
        """
        Determines the optimal numeric data type for a given range of values.

        Parameters
        ----------
        c_min : float
            The minimum value of the data.
        c_max : float
            The maximum value of the data.
        col_type : str
            The current data type of the column ('int' or 'float').

        Returns
        -------
        optimal_type : str
            The optimal data type for the given range of values.
        """
        type_info = np.iinfo if col_type == 'int' else np.finfo
        for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
            if col_type in str(dtype):
                if c_min > type_info(dtype).min and c_max < type_info(dtype).max:
                    return dtype
        return None

    """ Based on the data type and the range of values, the function determines the smallest possible data type that can accommodate the data without losing information. For example, if the data type is an integer and the range of values fits within the bounds of an int8 data type, the function converts the column data type to int8: """

    def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
        """
        Reduces memory usage of a pandas DataFrame by converting its columns to the most memory-efficient data types
        without losing information.

        Parameters
        ----------
        df : pd.DataFrame
            The input pandas DataFrame that needs memory optimization.

        Returns
        -------
        df : pd.DataFrame
            The optimized pandas DataFrame with reduced memory usage.
        """

        ## Iterate through each column in the DataFrame
        df_copy = df.copy()
        for col in df_copy.columns:
            col_type = df_copy[col].dtype

            ## Check if the data type is not an object (i.e., numeric type)
            if col_type != object:
                c_min, c_max = df_copy[col].min(), df_copy[col].max()
                col_type_str = 'int' if 'int' in str(col_type) else 'float'
                optimal_type = get_optimal_numeric_type(c_min, c_max, col_type_str)
                if optimal_type:
                    df_copy[col] = df_copy[col].astype(optimal_type)
            ## If the data type is an object, convert the column to a 'category' data type
            else:
                df_copy[col] = df_copy[col].astype('category')

        ## Return the optimized DataFrame with reduced memory usage
        return df_copy

Use [[pyarrow]] engine

  df = pd.read_csv(
    "dataset.csv",
    engine = "pyarrow"
  )

Use Modin
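
Modin parallelizes pandas operations across cores with (mostly) the same API; a minimal sketch, assuming Modin is installed with a backend such as Ray (pip install "modin[ray]"):

  import modin.pandas as pd  ## drop-in replacement for `import pandas as pd`

  df = pd.read_csv("dataset.csv")  ## same call, but the work is distributed
  print(df.describe())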

Clipping

## using pandas
df['Column'] = df['Column'].clip(
  lower = df['Column'].quantile(.25, interpolation="midpoint"),
  upper = df['Column'].quantile(.75, interpolation="midpoint")
)

Chaining

(
  df
  .pipe(function)
  .assign(
    new_column = lambda d: d["existing_column"] * 2 ## placeholder
  )
  .astype({"highway": np.int8})
)

SQL

df = pd.read_sql(query, engine)
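
A minimal sketch of building the engine with SQLAlchemy (the connection string and table are made up):

  from sqlalchemy import create_engine
  import pandas as pd

  ## hypothetical connection string
  engine = create_engine("postgresql+psycopg2://user:password@localhost:5432/mydb")

  query = "SELECT * FROM performances WHERE season = '2021/2022'"
  df = pd.read_sql(query, engine)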

Exporting to HTML

html = """
<head>
  <link rel="stylesheet" href="../../../backend/plugins/bootstrap/bootstrap.min.css">
  <script src="../../../backend/plugins/bootstrap/bootstrap.min.js"></script>
</head>
<body>
"""
html += blah_df.head().to_html(classes='table table-striped')
html += "</body>"

with open("blah_df.html", 'w') as exporter:
    exporter.write(html)

Get

Get last letter of every row

  spf_unrate["PERIOD"].str.get(-1)

First Name

  df["Name"] = df["Name"].str.split().str.get(0)

Year/Month/Date

  fred["YEAR"] = pd.DatetimeIndex(fred['DATE']).year
  fred = fred.drop("DATE", axis=1)
  fred

Time Series

df.shift(10)
df.diff(10)

## window functions
df.rolling(10).mean()
df.rolling(10, center=True).mean()

df.expanding(10).mean() ## expanding window with min_periods=10

## exponential weighted window
df.ewm(span=10).mean()

Missing Values

def check(data):
  print(data.isnull().values.any())

sheet.apply(check)          ## check if col has missing value
sheet.apply(check, axis=1)  ## check if row has missing value
sheet.pipe(check)           ## check if dataframe has missing value
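
A simpler alternative without the helper, using isna() directly:

  sheet.isna().any()        ## per column
  sheet.isna().any(axis=1)  ## per row
  sheet.isna().any().any()  ## whole dataframe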

My Cleaning

def clean_df(data):
  df = data.copy()

  for column in df.columns:
    if "date" in column.lower():
      df[column] = pd.to_datetime(df[column])

  df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

  return df

sheet.pipe(clean_df)