\[ \newcommand{\figleft}{{\em (Left)}} \newcommand{\figcenter}{{\em (Center)}} \newcommand{\figright}{{\em (Right)}} \newcommand{\figtop}{{\em (Top)}} \newcommand{\figbottom}{{\em (Bottom)}} \newcommand{\captiona}{{\em (a)}} \newcommand{\captionb}{{\em (b)}} \newcommand{\captionc}{{\em (c)}} \newcommand{\captiond}{{\em (d)}} \newcommand{\newterm}[1]{{\bf #1}} \def\figref#1{figure~\ref{#1}} \def\Figref#1{Figure~\ref{#1}} \def\twofigref#1#2{figures \ref{#1} and \ref{#2}} \def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}} \def\secref#1{section~\ref{#1}} \def\Secref#1{Section~\ref{#1}} \def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}} \def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}} \def\eqref#1{equation~\ref{#1}} \def\Eqref#1{Equation~\ref{#1}} \def\plaineqref#1{\ref{#1}} \def\chapref#1{chapter~\ref{#1}} \def\Chapref#1{Chapter~\ref{#1}} \def\rangechapref#1#2{chapters\ref{#1}--\ref{#2}} \def\algref#1{algorithm~\ref{#1}} \def\Algref#1{Algorithm~\ref{#1}} \def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}} \def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}} \def\partref#1{part~\ref{#1}} \def\Partref#1{Part~\ref{#1}} \def\twopartref#1#2{parts \ref{#1} and \ref{#2}} \def\ceil#1{\lceil #1 \rceil} \def\floor#1{\lfloor #1 \rfloor} \def\1{\bm{1}} \newcommand{\train}{\mathcal{D}} \newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}} \newcommand{\test}{\mathcal{D_{\mathrm{test}}}} \def\eps{{\epsilon}} \def\reta{{\textnormal{$\eta$}}} \def\ra{{\textnormal{a}}} \def\rb{{\textnormal{b}}} \def\rc{{\textnormal{c}}} \def\rd{{\textnormal{d}}} \def\re{{\textnormal{e}}} \def\rf{{\textnormal{f}}} \def\rg{{\textnormal{g}}} \def\rh{{\textnormal{h}}} \def\ri{{\textnormal{i}}} \def\rj{{\textnormal{j}}} \def\rk{{\textnormal{k}}} \def\rl{{\textnormal{l}}} \def\rn{{\textnormal{n}}} \def\ro{{\textnormal{o}}} \def\rp{{\textnormal{p}}} \def\rq{{\textnormal{q}}} \def\rr{{\textnormal{r}}} \def\rs{{\textnormal{s}}} \def\rt{{\textnormal{t}}} \def\ru{{\textnormal{u}}} \def\rv{{\textnormal{v}}} \def\rw{{\textnormal{w}}} \def\rx{{\textnormal{x}}} \def\ry{{\textnormal{y}}} \def\rz{{\textnormal{z}}} \def\rvepsilon{{\mathbf{\epsilon}}} \def\rvtheta{{\mathbf{\theta}}} \def\rva{{\mathbf{a}}} \def\rvb{{\mathbf{b}}} \def\rvc{{\mathbf{c}}} \def\rvd{{\mathbf{d}}} \def\rve{{\mathbf{e}}} \def\rvf{{\mathbf{f}}} \def\rvg{{\mathbf{g}}} \def\rvh{{\mathbf{h}}} \def\rvu{{\mathbf{i}}} \def\rvj{{\mathbf{j}}} \def\rvk{{\mathbf{k}}} \def\rvl{{\mathbf{l}}} \def\rvm{{\mathbf{m}}} \def\rvn{{\mathbf{n}}} \def\rvo{{\mathbf{o}}} \def\rvp{{\mathbf{p}}} \def\rvq{{\mathbf{q}}} \def\rvr{{\mathbf{r}}} \def\rvs{{\mathbf{s}}} \def\rvt{{\mathbf{t}}} \def\rvu{{\mathbf{u}}} \def\rvv{{\mathbf{v}}} \def\rvw{{\mathbf{w}}} \def\rvx{{\mathbf{x}}} \def\rvy{{\mathbf{y}}} \def\rvz{{\mathbf{z}}} \def\erva{{\textnormal{a}}} \def\ervb{{\textnormal{b}}} \def\ervc{{\textnormal{c}}} \def\ervd{{\textnormal{d}}} \def\erve{{\textnormal{e}}} \def\ervf{{\textnormal{f}}} \def\ervg{{\textnormal{g}}} \def\ervh{{\textnormal{h}}} \def\ervi{{\textnormal{i}}} \def\ervj{{\textnormal{j}}} \def\ervk{{\textnormal{k}}} \def\ervl{{\textnormal{l}}} \def\ervm{{\textnormal{m}}} \def\ervn{{\textnormal{n}}} \def\ervo{{\textnormal{o}}} \def\ervp{{\textnormal{p}}} \def\ervq{{\textnormal{q}}} \def\ervr{{\textnormal{r}}} \def\ervs{{\textnormal{s}}} \def\ervt{{\textnormal{t}}} \def\ervu{{\textnormal{u}}} \def\ervv{{\textnormal{v}}} \def\ervw{{\textnormal{w}}} \def\ervx{{\textnormal{x}}} \def\ervy{{\textnormal{y}}} \def\ervz{{\textnormal{z}}} \def\rmA{{\mathbf{A}}} \def\rmB{{\mathbf{B}}} \def\rmC{{\mathbf{C}}} \def\rmD{{\mathbf{D}}} \def\rmE{{\mathbf{E}}} \def\rmF{{\mathbf{F}}} \def\rmG{{\mathbf{G}}} \def\rmH{{\mathbf{H}}} \def\rmI{{\mathbf{I}}} \def\rmJ{{\mathbf{J}}} \def\rmK{{\mathbf{K}}} \def\rmL{{\mathbf{L}}} \def\rmM{{\mathbf{M}}} \def\rmN{{\mathbf{N}}} \def\rmO{{\mathbf{O}}} \def\rmP{{\mathbf{P}}} \def\rmQ{{\mathbf{Q}}} \def\rmR{{\mathbf{R}}} \def\rmS{{\mathbf{S}}} \def\rmT{{\mathbf{T}}} \def\rmU{{\mathbf{U}}} \def\rmV{{\mathbf{V}}} \def\rmW{{\mathbf{W}}} \def\rmX{{\mathbf{X}}} \def\rmY{{\mathbf{Y}}} \def\rmZ{{\mathbf{Z}}} \def\ermA{{\textnormal{A}}} \def\ermB{{\textnormal{B}}} \def\ermC{{\textnormal{C}}} \def\ermD{{\textnormal{D}}} \def\ermE{{\textnormal{E}}} \def\ermF{{\textnormal{F}}} \def\ermG{{\textnormal{G}}} \def\ermH{{\textnormal{H}}} \def\ermI{{\textnormal{I}}} \def\ermJ{{\textnormal{J}}} \def\ermK{{\textnormal{K}}} \def\ermL{{\textnormal{L}}} \def\ermM{{\textnormal{M}}} \def\ermN{{\textnormal{N}}} \def\ermO{{\textnormal{O}}} \def\ermP{{\textnormal{P}}} \def\ermQ{{\textnormal{Q}}} \def\ermR{{\textnormal{R}}} \def\ermS{{\textnormal{S}}} \def\ermT{{\textnormal{T}}} \def\ermU{{\textnormal{U}}} \def\ermV{{\textnormal{V}}} \def\ermW{{\textnormal{W}}} \def\ermX{{\textnormal{X}}} \def\ermY{{\textnormal{Y}}} \def\ermZ{{\textnormal{Z}}} \def\vzero{{\bm{0}}} \def\vone{{\bm{1}}} \def\vmu{{\bm{\mu}}} \def\vtheta{{\bm{\theta}}} \def\va{{\bm{a}}} \def\vb{{\bm{b}}} \def\vc{{\bm{c}}} \def\vd{{\bm{d}}} \def\ve{{\bm{e}}} \def\vf{{\bm{f}}} \def\vg{{\bm{g}}} \def\vh{{\bm{h}}} \def\vi{{\bm{i}}} \def\vj{{\bm{j}}} \def\vk{{\bm{k}}} \def\vl{{\bm{l}}} \def\vm{{\bm{m}}} \def\vn{{\bm{n}}} \def\vo{{\bm{o}}} \def\vp{{\bm{p}}} \def\vq{{\bm{q}}} \def\vr{{\bm{r}}} \def\vs{{\bm{s}}} \def\vt{{\bm{t}}} \def\vu{{\bm{u}}} \def\vv{{\bm{v}}} \def\vw{{\bm{w}}} \def\vx{{\bm{x}}} \def\vy{{\bm{y}}} \def\vz{{\bm{z}}} \def\evalpha{{\alpha}} \def\evbeta{{\beta}} \def\evepsilon{{\epsilon}} \def\evlambda{{\lambda}} \def\evomega{{\omega}} \def\evmu{{\mu}} \def\evpsi{{\psi}} \def\evsigma{{\sigma}} \def\evtheta{{\theta}} \def\eva{{a}} \def\evb{{b}} \def\evc{{c}} \def\evd{{d}} \def\eve{{e}} \def\evf{{f}} \def\evg{{g}} \def\evh{{h}} \def\evi{{i}} \def\evj{{j}} \def\evk{{k}} \def\evl{{l}} \def\evm{{m}} \def\evn{{n}} \def\evo{{o}} \def\evp{{p}} \def\evq{{q}} \def\evr{{r}} \def\evs{{s}} \def\evt{{t}} \def\evu{{u}} \def\evv{{v}} \def\evw{{w}} \def\evx{{x}} \def\evy{{y}} \def\evz{{z}} \def\mA{{\bm{A}}} \def\mB{{\bm{B}}} \def\mC{{\bm{C}}} \def\mD{{\bm{D}}} \def\mE{{\bm{E}}} \def\mF{{\bm{F}}} \def\mG{{\bm{G}}} \def\mH{{\bm{H}}} \def\mI{{\bm{I}}} \def\mJ{{\bm{J}}} \def\mK{{\bm{K}}} \def\mL{{\bm{L}}} \def\mM{{\bm{M}}} \def\mN{{\bm{N}}} \def\mO{{\bm{O}}} \def\mP{{\bm{P}}} \def\mQ{{\bm{Q}}} \def\mR{{\bm{R}}} \def\mS{{\bm{S}}} \def\mT{{\bm{T}}} \def\mU{{\bm{U}}} \def\mV{{\bm{V}}} \def\mW{{\bm{W}}} \def\mX{{\bm{X}}} \def\mY{{\bm{Y}}} \def\mZ{{\bm{Z}}} \def\mBeta{{\bm{\beta}}} \def\mPhi{{\bm{\Phi}}} \def\mLambda{{\bm{\Lambda}}} \def\mSigma{{\bm{\Sigma}}} \newcommand{\tens}[1]{\bm{\mathsfit{#1}}} \def\tA{{\tens{A}}} \def\tB{{\tens{B}}} \def\tC{{\tens{C}}} \def\tD{{\tens{D}}} \def\tE{{\tens{E}}} \def\tF{{\tens{F}}} \def\tG{{\tens{G}}} \def\tH{{\tens{H}}} \def\tI{{\tens{I}}} \def\tJ{{\tens{J}}} \def\tK{{\tens{K}}} \def\tL{{\tens{L}}} \def\tM{{\tens{M}}} \def\tN{{\tens{N}}} \def\tO{{\tens{O}}} \def\tP{{\tens{P}}} \def\tQ{{\tens{Q}}} \def\tR{{\tens{R}}} \def\tS{{\tens{S}}} \def\tT{{\tens{T}}} \def\tU{{\tens{U}}} \def\tV{{\tens{V}}} \def\tW{{\tens{W}}} \def\tX{{\tens{X}}} \def\tY{{\tens{Y}}} \def\tZ{{\tens{Z}}} \def\gA{{\mathcal{A}}} \def\gB{{\mathcal{B}}} \def\gC{{\mathcal{C}}} \def\gD{{\mathcal{D}}} \def\gE{{\mathcal{E}}} \def\gF{{\mathcal{F}}} \def\gG{{\mathcal{G}}} \def\gH{{\mathcal{H}}} \def\gI{{\mathcal{I}}} \def\gJ{{\mathcal{J}}} \def\gK{{\mathcal{K}}} \def\gL{{\mathcal{L}}} \def\gM{{\mathcal{M}}} \def\gN{{\mathcal{N}}} \def\gO{{\mathcal{O}}} \def\gP{{\mathcal{P}}} \def\gQ{{\mathcal{Q}}} \def\gR{{\mathcal{R}}} \def\gS{{\mathcal{S}}} \def\gT{{\mathcal{T}}} \def\gU{{\mathcal{U}}} \def\gV{{\mathcal{V}}} \def\gW{{\mathcal{W}}} \def\gX{{\mathcal{X}}} \def\gY{{\mathcal{Y}}} \def\gZ{{\mathcal{Z}}} \def\sA{{\mathbb{A}}} \def\sB{{\mathbb{B}}} \def\sC{{\mathbb{C}}} \def\sD{{\mathbb{D}}} \def\sF{{\mathbb{F}}} \def\sG{{\mathbb{G}}} \def\sH{{\mathbb{H}}} \def\sI{{\mathbb{I}}} \def\sJ{{\mathbb{J}}} \def\sK{{\mathbb{K}}} \def\sL{{\mathbb{L}}} \def\sM{{\mathbb{M}}} \def\sN{{\mathbb{N}}} \def\sO{{\mathbb{O}}} \def\sP{{\mathbb{P}}} \def\sQ{{\mathbb{Q}}} \def\sR{{\mathbb{R}}} \def\sS{{\mathbb{S}}} \def\sT{{\mathbb{T}}} \def\sU{{\mathbb{U}}} \def\sV{{\mathbb{V}}} \def\sW{{\mathbb{W}}} \def\sX{{\mathbb{X}}} \def\sY{{\mathbb{Y}}} \def\sZ{{\mathbb{Z}}} \def\emLambda{{\Lambda}} \def\emA{{A}} \def\emB{{B}} \def\emC{{C}} \def\emD{{D}} \def\emE{{E}} \def\emF{{F}} \def\emG{{G}} \def\emH{{H}} \def\emI{{I}} \def\emJ{{J}} \def\emK{{K}} \def\emL{{L}} \def\emM{{M}} \def\emN{{N}} \def\emO{{O}} \def\emP{{P}} \def\emQ{{Q}} \def\emR{{R}} \def\emS{{S}} \def\emT{{T}} \def\emU{{U}} \def\emV{{V}} \def\emW{{W}} \def\emX{{X}} \def\emY{{Y}} \def\emZ{{Z}} \def\emSigma{{\Sigma}} \newcommand{\etens}[1]{\mathsfit{#1}} \def\etLambda{{\etens{\Lambda}}} \def\etA{{\etens{A}}} \def\etB{{\etens{B}}} \def\etC{{\etens{C}}} \def\etD{{\etens{D}}} \def\etE{{\etens{E}}} \def\etF{{\etens{F}}} \def\etG{{\etens{G}}} \def\etH{{\etens{H}}} \def\etI{{\etens{I}}} \def\etJ{{\etens{J}}} \def\etK{{\etens{K}}} \def\etL{{\etens{L}}} \def\etM{{\etens{M}}} \def\etN{{\etens{N}}} \def\etO{{\etens{O}}} \def\etP{{\etens{P}}} \def\etQ{{\etens{Q}}} \def\etR{{\etens{R}}} \def\etS{{\etens{S}}} \def\etT{{\etens{T}}} \def\etU{{\etens{U}}} \def\etV{{\etens{V}}} \def\etW{{\etens{W}}} \def\etX{{\etens{X}}} \def\etY{{\etens{Y}}} \def\etZ{{\etens{Z}}} \newcommand{\pdata}{p_{\rm{data}}} \newcommand{\ptrain}{\hat{p}_{\rm{data}}} \newcommand{\Ptrain}{\hat{P}_{\rm{data}}} \newcommand{\pmodel}{p_{\rm{model}}} \newcommand{\Pmodel}{P_{\rm{model}}} \newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}} \newcommand{\pencode}{p_{\rm{encoder}}} \newcommand{\pdecode}{p_{\rm{decoder}}} \newcommand{\precons}{p_{\rm{reconstruct}}} \newcommand{\E}{\mathbb{E}} \newcommand{\Ls}{\mathcal{L}} \newcommand{\R}{\mathbb{R}} \newcommand{\emp}{\tilde{p}} \newcommand{\lr}{\alpha} \newcommand{\reg}{\lambda} \newcommand{\rect}{\mathrm{rectifier}} \newcommand{\softmax}{\mathrm{softmax}} \newcommand{\sigmoid}{\sigma} \newcommand{\softplus}{\zeta} \newcommand{\KL}{D_{\mathrm{KL}}} \newcommand{\Var}{\mathrm{Var}} \newcommand{\standarderror}{\mathrm{SE}} \newcommand{\Cov}{\mathrm{Cov}} \newcommand{\normlzero}{L^0} \newcommand{\normlone}{L^1} \newcommand{\normltwo}{L^2} \newcommand{\normlp}{L^p} \newcommand{\normmax}{L^\infty} \newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book. \DeclareMathOperator*{\argmax}{arg\,max} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator{\sign}{sign} \DeclareMathOperator{\Tr}{Tr} \let\ab\allowbreak \]

Searching Apple Mail inbox with FZF

2026-01-27

apple mail (1) fzf (1)

Apple mail search sucks¹. There was a time when it used to work, but now it does not work at all for my inboxes. I've considered switching to another mail client, but this is the app built by the manufacturer, and has some (actually one) advantage (of pre-filling confirmation codes from emails into the input field in safari). I also don't want to use another email client because I don't want to deal with syncing issues and/or maintaining two separate copies of my emails. I also don't want to install new tools because I already have too many. Really, all I need is the ability to pull up emails from the past and open it in Apple Mail. So I decided to try to put together a solution using software I already have installed -- namely FZF and Ripgrep.

FZF is a fuzzy finder that basically is a search portal into whatever scope you define. Ripgrep is the underlying search engine that FZF will use to search through files. Both are pretty fast and efficient. I use them but I'm not very familiar with either of them. So I've asked gemini/claude to write me a small utility script that will allow me to search through Apple Mail's local storage of emails (which are stored as .emlx files in ~/Library/Mail) and open the selected email in Apple Mail (so that I can collect the attachment or reply/forward). It seems to work well, so here it is:

First Try -- Raw Files

#!/bin/bash

# Check dependencies
if ! command -v fzf &> /dev/null || ! command -v rg &> /dev/null; then
    echo "Error: You need 'fzf' and 'ripgrep' installed."
    echo "Run: brew install fzf ripgrep"
    exit 1
fi

format_mail() {
    while read -r file; do
        [ -z "$file" ] && continue
        from=$(grep -m1 "^From:" "$file" 2>/dev/null | sed 's/^From: *//')
        to=$(grep -m1 "^To:" "$file" 2>/dev/null | sed 's/^To: *//')
        subj=$(grep -m1 "^Subject:" "$file" 2>/dev/null | sed 's/^Subject: *//')
        printf '%s\t%s → %s | %s\n' "$file" "${from:--}" "${to:--}" "${subj:--}"
    done
}
export -f format_mail

export MAIL_DIR="$HOME/Library/Mail"
echo "Loading emails from Apple Mail..."

if command -v bat &> /dev/null; then
    PREVIEW_CMD="bat --style=numbers --color=always {1}"
else
    PREVIEW_CMD="cat {1}"
fi

SELECTED_FILE=$(fzf \
    --height=100% \
    --border=none \
    --delimiter $'\t' \
    --with-nth 2 \
    --preview-window 'top:85%:wrap' \
    --preview "$PREVIEW_CMD" \
    --bind "start:reload:rg --files --glob '*.emlx' \$MAIL_DIR 2>/dev/null | format_mail" \
    --query "$1" \
| cut -f1)

# Open Logic
if [ -n "$SELECTED_FILE" ]; then
    echo "Opening in Apple Mail..."
    open "$SELECTED_FILE"
else
    echo "No email selected."
fi

This gets the job done. However, it is also slow for larger inboxes, and we cannot sort by dates or other fields.

Second version -- Custom index

The main source of slowness is that we are relying on ripgrep inspecting every single file every time we update the query. If we want to make this faster, we need some intermediate "cache" of the emails, at least with fields that we can search by. In other words, we need an index. So an index we shall build. We will just a simple sqlite database of a single table where each entry is an email, now accompanied by some more useful fields like date, from, to, subject and the body text. This now also allows us to sanitize the email body to remove encoded content (which is not useful for searching).

Before I tried this, I also looked into using apple's built in Spotlight search (mdfind). It's supposed to have its own robust indexing system that allows for the user to search the filesystem/apps. However, it doesn't seem to work on my Mail directory ($HOME/Library/Mail/**). I can get some file metadata, but I cannot seem to get the actual file contents from this. But I tried all this with gemini, so it is possible I missed something.

For this version, we now have two scripts. One is a python script that interacts with our index. Other is the same (simpler) bash script that will be the UI for us to use the python script with. Since I have uv installed on my system, I will use that to provide the dependencies/virtual environment for this script (so just keep it in a project.) I made a short post on uv and pixi here

import argparse
import datetime
import email
import glob
import os
import re
import sqlite3
import sys
import time
from email import policy
from email.parser import BytesParser

# Configuration
DB_PATH = os.path.expanduser("~/mail_index.db")
MAIL_ROOT = os.path.expanduser("~/Library/Mail")
SEARCH_PATTERN = os.path.join(MAIL_ROOT, "**/Data/**/Messages/*.emlx")


def get_db_connection():
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    return conn


def init_db():
    conn = get_db_connection()
    c = conn.cursor()

    # Main table for metadata
    c.execute("""
        CREATE TABLE IF NOT EXISTS emails (
            path TEXT PRIMARY KEY,
            message_id TEXT,
            date_ts INTEGER
        )
    """)

    # FTS5 Virtual table for fast full-text search
    # content='' means we don't duplicate data if we had a main table for body,
    # but here we keep the text inside FTS for simplicity.
    c.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS emails_fts USING fts5(
            subject,
            sender,
            body,
            path UNINDEXED,
            date_display UNINDEXED
        )
    """)

    conn.commit()
    return conn


def parse_emlx(path):
    """
    Parses an .emlx file.
    Note: emlx files often start with a byte count integer on line 1,
    followed by the standard MIME content.
    """
    try:
        with open(path, "rb") as f:
            raw_data = f.read()
        newline_pos = raw_data.find(b"\n")
        if newline_pos != -1:
            first_line = raw_data[:newline_pos].strip()
            if first_line.isdigit():
                raw_data = raw_data[newline_pos + 1 :]

        msg = BytesParser(policy=policy.default).parsebytes(raw_data)

        subject = msg.get("subject", "(No Subject)")
        sender = msg.get("from", "(Unknown)")
        date_str = msg.get("date")

        # Parse Date to timestamp
        date_ts = 0
        if date_str:
            try:
                dt = email.utils.parsedate_to_datetime(date_str)
                date_ts = int(dt.timestamp())
            except:
                date_ts = 0

        # Extract Body
        body = ""
        if msg.is_multipart():
            for part in msg.walk():
                ctype = part.get_content_type()
                cdispo = str(part.get("Content-Disposition"))

                # skip attachments
                if "attachment" in cdispo:
                    continue

                if ctype == "text/plain":
                    try:
                        body += part.get_content() + "\n"
                    except:
                        pass
        else:
            try:
                body = msg.get_content()
            except:
                pass

        # Clean up body a bit
        body = " ".join(body.split())

        return {
            "path": path,
            "message_id": msg.get("message-id", path),
            "subject": subject,
            "sender": sender,
            "body": body,
            "date_ts": date_ts,
            "date_display": date_str or "Unknown Date",
        }
    except Exception as e:
        return None


def run_indexer():
    print("Checking for new emails...", file=sys.stderr)
    conn = get_db_connection()
    cursor = conn.cursor()

    # Get existing paths to avoid re-indexing
    cursor.execute("SELECT path FROM emails")
    existing_paths = set(row["path"] for row in cursor.fetchall())

    # Find files
    # recursive glob might be slow on huge mailboxes, but it's simple
    files = glob.glob(SEARCH_PATTERN, recursive=True)

    new_files = [f for f in files if f not in existing_paths]

    if not new_files:
        print("Index is up to date.", file=sys.stderr)
        return

    print(f"Indexing {len(new_files)} new emails...", file=sys.stderr)

    batch = []
    count = 0

    for f in new_files:
        data = parse_emlx(f)
        if data:
            # Prepare for emails table
            batch.append(
                (
                    data["path"],
                    data["message_id"],
                    data["date_ts"],
                    data["subject"],
                    data["sender"],
                    data["body"],
                    data["path"],  # for FTS
                    data["date_display"],
                )
            )
            count += 1

        if count % 100 == 0:
            sys.stderr.write(f"\rProcessed {count}/{len(new_files)}")

            # Insert into lookup table
            cursor.executemany(
                "INSERT INTO emails (path, message_id, date_ts) VALUES (?, ?, ?)",
                [(x[0], x[1], x[2]) for x in batch],
            )
            # Insert into FTS table
            cursor.executemany(
                "INSERT INTO emails_fts (subject, sender, body, path, date_display) VALUES (?, ?, ?, ?, ?)",
                [(x[3], x[4], x[5], x[6], x[7]) for x in batch],
            )
            conn.commit()
            batch = []

    # Final batch
    if batch:
        cursor.executemany(
            "INSERT INTO emails (path, message_id, date_ts) VALUES (?, ?, ?)",
            [(x[0], x[1], x[2]) for x in batch],
        )
        cursor.executemany(
            "INSERT INTO emails_fts (subject, sender, body, path, date_display) VALUES (?, ?, ?, ?, ?)",
            [(x[3], x[4], x[5], x[6], x[7]) for x in batch],
        )
        conn.commit()

    print(f"\nDone. Indexed {count} files.", file=sys.stderr)


def build_fts_query(user_query):
    """
    Translates user syntax:
      hello world "invoice pdf" !urgent
    To SQLite FTS syntax:
      hello AND world AND "invoice pdf" NOT urgent
    """
    if not user_query or not user_query.strip():
        return None

    # Simple tokenizer that respects quotes
    # This regex finds quoted strings OR non-whitespace chunks
    tokens = re.findall(r'(?:".*?"|\S+)', user_query)

    fts_parts = []

    for token in tokens:
        if token.startswith("!"):
            # Negation
            term = token[1:]
            if term:
                fts_parts.append(f"NOT {term}")
        else:
            if fts_parts:
                fts_parts.append("AND")
            fts_parts.append(token)

    return " ".join(fts_parts)


def run_search(query_str):
    conn = get_db_connection()
    cursor = conn.cursor()

    if not query_str:
        # Return latest 50 emails if no query
        sql = """
            SELECT e.path, f.subject, f.sender, f.date_display
            FROM emails e
            JOIN emails_fts f ON e.path = f.path
            ORDER BY e.date_ts DESC LIMIT 50
        """
        cursor.execute(sql)
    else:
        fts_query = build_fts_query(query_str)
        # We join with the main table to get the numeric timestamp for sorting
        sql = """
            SELECT e.path, snippet(emails_fts, 0, '[', ']', '...', 10) as subject, f.sender, f.date_display
            FROM emails_fts f
            JOIN emails e ON f.path = e.path
            WHERE emails_fts MATCH ?
            ORDER BY rank, e.date_ts DESC LIMIT 100
        """
        try:
            cursor.execute(sql, (fts_query,))
        except sqlite3.OperationalError:
            print("Invalid Query Syntax")
            return

    results = cursor.fetchall()

    for row in results:
        sender = (
            (row["sender"][:25] + "..")
            if len(row["sender"]) > 25
            else row["sender"].ljust(25)
        )
        print(f"{row['path']}\t{row['date_display'][:16]}\t{sender}\t{row['subject']}")


if __name__ == "__main__":
    init_db()

    parser = argparse.ArgumentParser()
    parser.add_argument("--index", action="store_true", help="Update the index")
    parser.add_argument("--query", type=str, help="Search query")

    args = parser.parse_args()

    if args.index:
        run_indexer()

    # If a query is provided (or empty string passed specifically), search.
    # Otherwise if just --index was passed, don't search.
    if args.query is not None:
        run_search(args.query)

And now the much simpler bash script:

#!/bin/bash

PYTHON_SCRIPT="$HOME/Documents/projects/search-mail/search_and_index.py"
PYTHON_SCRIPT_DIR="$HOME/Documents/projects/search-mail"
INDEXER="uv run --directory \"$PYTHON_SCRIPT_DIR\" \"$PYTHON_SCRIPT\""

# first check the index is good.
eval "$INDEXER --index"

INITIAL_QUERY=""

if command -v bat &>/dev/null; then
    PREVIEW_CMD="bat --style=numbers --color=always {1}"
else
    PREVIEW_CMD="cat {1}"
fi

eval "$INDEXER --query \"$INITIAL_QUERY\"" |
    fzf --delimiter "\t" --with-nth 2.. \
        --height=100% \
        --border=none \
        --preview-window 'top:85%:wrap' \
        --preview "$PREVIEW_CMD" \
        --ansi \
        --layout=reverse \
        --header "Search (Standard words AND, \"phrases\", !negation) | Enter to Open" \
        --bind "change:reload:python3 $PYTHON_SCRIPT --query {q}" \
        --disabled \
        --bind "enter:execute(open {1})+abort"

And here it is. We now have a simple indexing system that allows us to quickly search through our Apple Mail inbox using FZF. The indexer script will run every time the search is launched -- but it will only index new emails, so it should be pretty fast after the first run.

A full (and probably improved) version of this code can be found in this repo

https://www.bugsappleloves.com ↩

← Installing packages on a machine without sudo using conda

← Back to all tech-tipss