Let's implement a simple version step by step. You do not need any special library for the scratch version — just Python's standard library.
Step 1 — Normal (keyword count) search
python · normal_search.py
# ── Normal Search: just count matching words ──────────────────
def tokenize(text):
"""Lowercase and split into words."""
return text.lower().split()
def normal_score(document, query_tokens):
"""Count how many query words appear in the document."""
doc_tokens = tokenize(document)
score = 0
for word in query_tokens:
score += doc_tokens.count(word) # raw count — no limit
return score
# ── Example ───────────────────────────────────────────────────
documents = [
"python python python developer", # Doc A
"python developer roadmap guide", # Doc B
"developer", # Doc C
]
query = "python developer"
query_tokens = tokenize(query)
results = [
(doc, normal_score(doc, query_tokens))
for doc in documents
]
results.sort(key=lambda x: x[1], reverse=True)
for rank, (doc, score) in enumerate(results, 1):
print(f"{rank}. Score={score} → {doc}")
## Output:
## 1. Score=4 → python python python developer ← spammy doc wins 😤
## 2. Score=2 → python developer roadmap guide
## 3. Score=1 → developerStep 2 — BM25 from scratch
python · bm25_scratch.py
import math
from collections import Counter
# ── BM25 from scratch ─────────────────────────────────────────
def tokenize(text):
return text.lower().split()
class BM25:
def __init__(self, corpus, k1=1.5, b=0.75):
self.k1 = k1
self.b = b
self.tokenized_corpus = [tokenize(doc) for doc in corpus]
self.doc_count = len(corpus)
self.avg_dl = sum(len(d) for d in self.tokenized_corpus) / self.doc_count
self.tf = [Counter(d) for d in self.tokenized_corpus]
def _idf(self, word):
docs_with_word = sum(1 for tf in self.tf if word in tf)
if docs_with_word == 0:
return 0.0
return math.log(
(self.doc_count - docs_with_word + 0.5) /
(docs_with_word + 0.5) + 1
)
def score(self, doc_index, query_tokens):
tf = self.tf[doc_index]
dl = len(self.tokenized_corpus[doc_index])
total = 0.0
for word in query_tokens:
freq = tf.get(word, 0)
idf = self._idf(word)
numerator = freq * (self.k1 + 1)
denominator = freq + self.k1 * (1 - self.b + self.b * dl / self.avg_dl)
total += idf * (numerator / denominator)
return round(total, 4)
def rank(self, query):
tokens = tokenize(query)
scores = [(i, self.score(i, tokens)) for i in range(self.doc_count)]
return sorted(scores, key=lambda x: x[1], reverse=True)
# ── Example ───────────────────────────────────────────────────
documents = [
"python python python developer",
"python developer roadmap guide",
"developer",
]
bm25 = BM25(documents)
results = bm25.rank("python developer")
for rank, (idx, score) in enumerate(results, 1):
print(f"{rank}. BM25={score} → {documents[idx]}")
## Output:
## 1. BM25=1.0986 → python developer roadmap guide ← quality wins 🏆
## 2. BM25=0.8109 → python python python developer
## 3. BM25=0.4055 → developerStep 3 — Use the rank_bm25 library (recommended)
Install once: pip install rank-bm25 ·
PyPI ·
GitHub python · rank_bm25_library.py
from rank_bm25 import BM25Okapi
# Tokenise each document into a list of words
documents = [
"python python python developer",
"python developer roadmap guide",
"developer",
]
tokenized_docs = [doc.split() for doc in documents]
# Build the BM25 index
bm25 = BM25Okapi(tokenized_docs)
# Search
query = "python developer"
scores = bm25.get_scores(query.split())
for idx, score in enumerate(scores):
print(f"Doc {idx}: score={score:.4f} → {documents[idx]}")
## Doc 0: score=0.8109 → python python python developer
## Doc 1: score=1.0986 → python developer roadmap guide ← still wins ✅
## Doc 2: score=0.4055 → developer
# Get top-N results
top_docs = bm25.get_top_n(query.split(), documents, n=2)
print("
Top 2 results:")
for doc in top_docs:
print(" -", doc)Step 4 — Use BM25 in a RAG pipeline
python · bm25_rag_example.py
from rank_bm25 import BM25Okapi
# Your knowledge base (replace with real documents)
knowledge_base = [
"Python is a popular programming language for data science.",
"BM25 is a ranking function used in information retrieval.",
"RAG stands for Retrieval-Augmented Generation.",
"Elasticsearch uses BM25 as its default ranking algorithm.",
"Vector embeddings capture semantic meaning of text.",
]
def build_bm25_index(docs):
tokenized = [doc.lower().split() for doc in docs]
return BM25Okapi(tokenized)
def retrieve(query, docs, bm25_index, top_k=3):
"""Retrieve the top-k most relevant documents for the query."""
tokens = query.lower().split()
return bm25_index.get_top_n(tokens, docs, n=top_k)
# Build index once
index = build_bm25_index(knowledge_base)
# Retrieve at query time
user_query = "How does BM25 work in search?"
top_contexts = retrieve(user_query, knowledge_base, index)
print("Retrieved context for LLM:")
for i, ctx in enumerate(top_contexts, 1):
print(f" {i}. {ctx}")
# Then pass top_contexts to your LLM (OpenAI, Ollama, etc.) as context
Sign in with Google to add comments and replies.