Build a Search Index
Introduction
A search index is a data structure that maps terms to documents, enabling fast full-text search. It's the core technology behind search engines.
In this tutorial, we'll build "SearchIndex" - a complete search indexing system with tokenization, TF-IDF ranking, and query processing.
What You'll Build
- Inverted index data structure
- Text tokenization
- TF-IDF ranking algorithm
- Query processing
- Search API
Core Concepts
This section introduces the data structures and ranking ideas you need before writing any code.
Index Types
- Inverted Index - Term → Documents mapping
- Forward Index - Document → Terms mapping
- Bitmap Index - Bit arrays marking, for each term, which documents contain it
Project Setup
Bash
# Create the project directory and enter it.
mkdir search-index
cd search-index
# Create and activate an isolated Python virtual environment.
python -m venv venv
source venv/bin/activate
# Install tokenization (nltk) and BM25 ranking (rank-bm25) helpers.
# NOTE(review): the tutorial code below uses only the standard library;
# these packages are optional extensions.
pip install nltk rank-bm25
Inverted Index
Python
# index.py
from collections import defaultdict
import math
class InvertedIndex:
    """In-memory inverted index mapping terms to documents, with TF-IDF search.

    Documents are added with add_document(); search() returns ranked
    (doc_id, score) pairs, best first.
    """

    def __init__(self):
        self.index = defaultdict(set)      # term -> {doc_id, ...}
        self.document_count = 0            # number of distinct documents
        self.documents = {}                # doc_id -> raw content
        # doc_id -> {term: occurrence count}, used for the TF component
        self.doc_term_freq = defaultdict(lambda: defaultdict(int))

    def add_document(self, doc_id, content):
        """Tokenize `content` and index it under `doc_id`.

        Re-adding an existing doc_id no longer inflates document_count
        (the original incremented unconditionally, skewing IDF).
        """
        if doc_id not in self.documents:
            self.document_count += 1
        self.documents[doc_id] = content
        for token in self._tokenize(content):
            self.index[token].add(doc_id)
            self.doc_term_freq[doc_id][token] += 1

    def _tokenize(self, text):
        """Lowercase `text` and return word tokens longer than 2 characters."""
        import re  # local import kept to match the tutorial's layout
        return [t for t in re.findall(r'\w+', text.lower()) if len(t) > 2]

    def search(self, query):
        """Return [(doc_id, tf_idf_score), ...] sorted by score, descending.

        Empty or all-filtered queries return an empty list.
        """
        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []
        results = defaultdict(float)
        for token in query_tokens:
            idf = self._calculate_idf(token)
            for doc_id in self.index.get(token, set()):
                # Score contribution: raw term frequency * inverse doc frequency.
                results[doc_id] += self.doc_term_freq[doc_id][token] * idf
        return sorted(results.items(), key=lambda x: x[1], reverse=True)

    def _calculate_idf(self, term):
        """Return log(N / df) for `term`, or 0 when the term is unseen."""
        df = len(self.index.get(term, set()))
        if df == 0:
            return 0
        return math.log(self.document_count / df)
Tokenization
Python
# tokenizer.py
import re
from collections import Counter
class Tokenizer:
    """Tokenizer with length filtering, stop-word removal, naive stemming, n-grams.

    Args:
        min_length: shortest token kept (inclusive).
        max_length: longest token kept (inclusive).
    """

    def __init__(self, min_length=2, max_length=50):
        self.min_length = min_length
        self.max_length = max_length
        # Common English words that carry little search relevance.
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or',
            'but', 'in', 'on', 'at',
            'to', 'for', 'of', 'with',
        }

    def tokenize(self, text):
        """Lowercase, strip punctuation, and return length-filtered non-stop-words."""
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return [t for t in text.split()
                if self.min_length <= len(t) <= self.max_length
                and t not in self.stop_words]

    def stem(self, word):
        """Strip one common suffix ('ing', 'ed', 'es', 's') when enough stem remains.

        Returns the stemmed word, or `word` unchanged when no suffix applies.
        (Bug fix: the original ended with a bare `return`, yielding None.)
        """
        for suffix in ('ing', 'ed', 'es', 's'):
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                return word[:-len(suffix)]
        return word

    def get_ngrams(self, text, n=2):
        """Return space-joined n-grams over tokenize(text); [] when too few tokens."""
        tokens = self.tokenize(text)
        return [' '.join(tokens[i:i + n])
                for i in range(len(tokens) - n + 1)]
TF-IDF Ranking
Python
# ranker.py
import math
from collections import Counter
class TFIDFRanker:
    """Standalone TF-IDF scorer over {doc_id: content} corpora.

    Call index_documents() first to record corpus statistics, then
    calculate_tf_idf() to rank documents against a query.
    """

    def __init__(self):
        self.doc_lengths = {}     # doc_id -> token count
        self.avg_doc_length = 0   # mean tokens per document (BM25-style stat)
        self.doc_count = 0

    def index_documents(self, documents):
        """Record per-document lengths and corpus size for `documents`."""
        self.doc_count = len(documents)
        for doc_id, content in documents.items():
            self.doc_lengths[doc_id] = len(content.lower().split())
        # Guard the empty corpus (the original raised ZeroDivisionError).
        if self.doc_count:
            self.avg_doc_length = sum(self.doc_lengths.values()) / self.doc_count

    def calculate_tf(self, term, document):
        """Term frequency: occurrences of `term` / total tokens; 0 for empty docs."""
        tokens = document.lower().split()
        if not tokens:
            return 0
        return tokens.count(term.lower()) / len(tokens)

    def calculate_idf(self, term, documents):
        """Inverse document frequency: log(N / df); 0 when df == 0."""
        docs_with_term = sum(1 for doc in documents.values()
                             if term.lower() in doc.lower().split())
        if docs_with_term == 0:
            return 0
        # Fall back to len(documents) if index_documents() wasn't called,
        # avoiding log(0) from an unset doc_count.
        n_docs = self.doc_count or len(documents)
        return math.log(n_docs / docs_with_term)

    def calculate_tf_idf(self, query, documents):
        """Score every document against `query`; return [(doc_id, score)] best-first."""
        results = []
        for doc_id, content in documents.items():
            score = sum(
                self.calculate_tf(term, content) * self.calculate_idf(term, documents)
                for term in query.lower().split()
            )
            results.append((doc_id, score))
        return sorted(results, key=lambda x: x[1], reverse=True)
Query Processing
Python
# query.py
class QueryProcessor:
    """Route queries to phrase, boolean, exclusion, or plain ranked search.

    `index` must expose search(query) -> [(doc_id, score)] and a
    `documents` dict of {doc_id: content}.
    """

    def __init__(self, index):
        self.index = index

    def process(self, query, options=None):
        """Dispatch on query syntax: "..." phrase, OR / AND boolean, -term
        exclusion, else a plain ranked search. `options` is accepted for
        API compatibility and currently unused."""
        options = options or {}
        query = query.strip()
        if query.startswith('"') and query.endswith('"'):
            return self._exact_phrase(query[1:-1])
        if ' OR ' in query.upper():
            return self._boolean_or(query)
        if ' AND ' in query.upper():
            return self._boolean_and(query)
        # Only whole tokens prefixed with '-' trigger exclusion; the original
        # matched any hyphen, catching hyphenated words like "state-of-the-art".
        if any(tok.startswith('-') for tok in query.split()):
            return self._exclude(query)
        return self.index.search(query)

    def _exact_phrase(self, phrase):
        """Case-insensitive substring match over stored documents; score fixed at 1.0."""
        needle = phrase.lower()
        return [(doc_id, 1.0)
                for doc_id, content in self.index.documents.items()
                if needle in content.lower()]

    def _boolean_or(self, query):
        """Union of doc ids matching any OR-separated term."""
        matched = set()
        for term in query.upper().split(' OR '):
            matched.update(doc_id for doc_id, _ in self.index.search(term.strip()))
        return [(doc_id, 1.0) for doc_id in matched]

    def _boolean_and(self, query):
        """Intersection of doc ids matching every AND-separated term."""
        result_sets = [
            {doc_id for doc_id, _ in self.index.search(term.strip())}
            for term in query.upper().split(' AND ')
        ]
        if not result_sets:
            return []
        common = set.intersection(*result_sets)
        return [(doc_id, 1.0) for doc_id in common]

    def _exclude(self, query):
        """Search the include terms, then drop docs matching any '-'-prefixed term.

        (Bug fix: the original's exclude-terms line had an unbalanced
        bracket and did not parse.)
        """
        tokens = query.split()
        exclude_terms = [t[1:] for t in tokens if t.startswith('-')]
        include_terms = [t for t in tokens if not t.startswith('-')]
        results = self.index.search(' '.join(include_terms))
        excluded = set()
        for term in exclude_terms:
            excluded.update(doc_id for doc_id, _ in self.index.search(term))
        return [(doc_id, score) for doc_id, score in results
                if doc_id not in excluded]
Testing
Python
# test_search.py
from index import InvertedIndex
# Build a small corpus and exercise the index end to end.
index = InvertedIndex()
documents = {
    'doc1': "Python is a great programming language",
    'doc2': "JavaScript is also popular for web",
    'doc3': "Machine learning with Python is powerful",
    'doc4': "Web development with JavaScript frameworks"
}

# Index every document in the corpus.
for doc_id in documents:
    index.add_document(doc_id, documents[doc_id])

# Single-term query.
print("Search 'Python':", index.search("Python"))

# Multi-term query spanning several documents.
print("Search 'web programming':", index.search("web programming"))
Testing Checklist
- Documents are indexed correctly
- Search returns relevant results
- TF-IDF ranking works
- Boolean queries work
Summary
You've built a complete search indexing system.
What You Built
- Inverted Index - Fast term lookup
- Tokenizer - Text processing
- TF-IDF Ranking - Relevance scoring
- Query Processor - Boolean queries