Build a Search Index
Introduction
A search index is a data structure that maps terms to documents, enabling fast full-text search. It's the core technology behind search engines.
In this tutorial, we'll build "SearchIndex" - a complete search indexing system with tokenization, TF-IDF ranking, and query processing.
What You'll Build
- Inverted index data structure
- Text tokenization
- TF-IDF ranking algorithm
- Query processing
- Search API
Core Concepts
This section introduces the data structures and ranking ideas you need before writing any code.
Index Types
- Inverted Index - Term → Documents mapping
- Forward Index - Document → Terms mapping
- Bitmap Index - Bit arrays marking, for each term, which documents contain it
Project Setup
Bash
# Create the project directory and enter it.
mkdir search-index
cd search-index
# Create and activate an isolated Python virtual environment.
python -m venv venv
source venv/bin/activate
# Install tokenization (nltk) and BM25 ranking (rank-bm25) helpers.
# NOTE(review): the tutorial code below uses only the standard library;
# these packages are optional extensions.
pip install nltk rank-bm25
Inverted Index
Python
# index.py
from collections import defaultdict
import math
class InvertedIndex:
    """In-memory inverted index mapping terms to documents, with TF-IDF search.

    Documents are added with add_document(); search() returns ranked
    (doc_id, score) pairs, best first.
    """

    def __init__(self):
        self.index = defaultdict(set)      # term -> {doc_id, ...}
        self.document_count = 0            # number of distinct documents
        self.documents = {}                # doc_id -> raw content
        # doc_id -> {term: occurrence count}, used for the TF component
        self.doc_term_freq = defaultdict(lambda: defaultdict(int))

    def add_document(self, doc_id, content):
        """Tokenize `content` and index it under `doc_id`.

        Re-adding an existing doc_id no longer inflates document_count
        (the original incremented unconditionally, skewing IDF).
        """
        if doc_id not in self.documents:
            self.document_count += 1
        self.documents[doc_id] = content
        for token in self._tokenize(content):
            self.index[token].add(doc_id)
            self.doc_term_freq[doc_id][token] += 1

    def _tokenize(self, text):
        """Lowercase `text` and return word tokens longer than 2 characters."""
        import re  # local import kept to match the tutorial's layout
        return [t for t in re.findall(r'\w+', text.lower()) if len(t) > 2]

    def search(self, query):
        """Return [(doc_id, tf_idf_score), ...] sorted by score, descending.

        Empty or all-filtered queries return an empty list.
        """
        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []
        results = defaultdict(float)
        for token in query_tokens:
            idf = self._calculate_idf(token)
            for doc_id in self.index.get(token, set()):
                # Score contribution: raw term frequency * inverse doc frequency.
                results[doc_id] += self.doc_term_freq[doc_id][token] * idf
        return sorted(results.items(), key=lambda x: x[1], reverse=True)

    def _calculate_idf(self, term):
        """Return log(N / df) for `term`, or 0 when the term is unseen."""
        df = len(self.index.get(term, set()))
        if df == 0:
            return 0
        return math.log(self.document_count / df)
Tokenization
Python
# tokenizer.py
import re
from collections import Counter
class Tokenizer:
    """Tokenizer with length filtering, stop-word removal, naive stemming, n-grams.

    Args:
        min_length: shortest token kept (inclusive).
        max_length: longest token kept (inclusive).
    """

    def __init__(self, min_length=2, max_length=50):
        self.min_length = min_length
        self.max_length = max_length
        # Common English words that carry little search relevance.
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or',
            'but', 'in', 'on', 'at',
            'to', 'for', 'of', 'with',
        }

    def tokenize(self, text):
        """Lowercase, strip punctuation, and return length-filtered non-stop-words."""
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return [t for t in text.split()
                if self.min_length <= len(t) <= self.max_length
                and t not in self.stop_words]

    def stem(self, word):
        """Strip one common suffix ('ing', 'ed', 'es', 's') when enough stem remains.

        Returns the stemmed word, or `word` unchanged when no suffix applies.
        (Bug fix: the original ended with a bare `return`, yielding None.)
        """
        for suffix in ('ing', 'ed', 'es', 's'):
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                return word[:-len(suffix)]
        return word

    def get_ngrams(self, text, n=2):
        """Return space-joined n-grams over tokenize(text); [] when too few tokens."""
        tokens = self.tokenize(text)
        return [' '.join(tokens[i:i + n])
                for i in range(len(tokens) - n + 1)]
TF-IDF Ranking
Python
# ranker.py
import math
from collections import Counter
class TFIDFRanker:
    """Standalone TF-IDF scorer over {doc_id: content} corpora.

    Call index_documents() first to record corpus statistics, then
    calculate_tf_idf() to rank documents against a query.
    """

    def __init__(self):
        self.doc_lengths = {}     # doc_id -> token count
        self.avg_doc_length = 0   # mean tokens per document (BM25-style stat)
        self.doc_count = 0

    def index_documents(self, documents):
        """Record per-document lengths and corpus size for `documents`."""
        self.doc_count = len(documents)
        for doc_id, content in documents.items():
            self.doc_lengths[doc_id] = len(content.lower().split())
        # Guard the empty corpus (the original raised ZeroDivisionError).
        if self.doc_count:
            self.avg_doc_length = sum(self.doc_lengths.values()) / self.doc_count

    def calculate_tf(self, term, document):
        """Term frequency: occurrences of `term` / total tokens; 0 for empty docs."""
        tokens = document.lower().split()
        if not tokens:
            return 0
        return tokens.count(term.lower()) / len(tokens)

    def calculate_idf(self, term, documents):
        """Inverse document frequency: log(N / df); 0 when df == 0."""
        docs_with_term = sum(1 for doc in documents.values()
                             if term.lower() in doc.lower().split())
        if docs_with_term == 0:
            return 0
        # Fall back to len(documents) if index_documents() wasn't called,
        # avoiding log(0) from an unset doc_count.
        n_docs = self.doc_count or len(documents)
        return math.log(n_docs / docs_with_term)

    def calculate_tf_idf(self, query, documents):
        """Score every document against `query`; return [(doc_id, score)] best-first."""
        results = []
        for doc_id, content in documents.items():
            score = sum(
                self.calculate_tf(term, content) * self.calculate_idf(term, documents)
                for term in query.lower().split()
            )
            results.append((doc_id, score))
        return sorted(results, key=lambda x: x[1], reverse=True)
Query Processing
Python
# query.py
class QueryProcessor:
    """Route queries to phrase, boolean, exclusion, or plain ranked search.

    `index` must expose search(query) -> [(doc_id, score)] and a
    `documents` dict of {doc_id: content}.
    """

    def __init__(self, index):
        self.index = index

    def process(self, query, options=None):
        """Dispatch on query syntax: "..." phrase, OR / AND boolean, -term
        exclusion, else a plain ranked search. `options` is accepted for
        API compatibility and currently unused."""
        options = options or {}
        query = query.strip()
        if query.startswith('"') and query.endswith('"'):
            return self._exact_phrase(query[1:-1])
        if ' OR ' in query.upper():
            return self._boolean_or(query)
        if ' AND ' in query.upper():
            return self._boolean_and(query)
        # Only whole tokens prefixed with '-' trigger exclusion; the original
        # matched any hyphen, catching hyphenated words like "state-of-the-art".
        if any(tok.startswith('-') for tok in query.split()):
            return self._exclude(query)
        return self.index.search(query)

    def _exact_phrase(self, phrase):
        """Case-insensitive substring match over stored documents; score fixed at 1.0."""
        needle = phrase.lower()
        return [(doc_id, 1.0)
                for doc_id, content in self.index.documents.items()
                if needle in content.lower()]

    def _boolean_or(self, query):
        """Union of doc ids matching any OR-separated term."""
        matched = set()
        for term in query.upper().split(' OR '):
            matched.update(doc_id for doc_id, _ in self.index.search(term.strip()))
        return [(doc_id, 1.0) for doc_id in matched]

    def _boolean_and(self, query):
        """Intersection of doc ids matching every AND-separated term."""
        result_sets = [
            {doc_id for doc_id, _ in self.index.search(term.strip())}
            for term in query.upper().split(' AND ')
        ]
        if not result_sets:
            return []
        common = set.intersection(*result_sets)
        return [(doc_id, 1.0) for doc_id in common]

    def _exclude(self, query):
        """Search the include terms, then drop docs matching any '-'-prefixed term.

        (Bug fix: the original's exclude-terms line had an unbalanced
        bracket and did not parse.)
        """
        tokens = query.split()
        exclude_terms = [t[1:] for t in tokens if t.startswith('-')]
        include_terms = [t for t in tokens if not t.startswith('-')]
        results = self.index.search(' '.join(include_terms))
        excluded = set()
        for term in exclude_terms:
            excluded.update(doc_id for doc_id, _ in self.index.search(term))
        return [(doc_id, score) for doc_id, score in results
                if doc_id not in excluded]
Testing
Python
# test_search.py
from index import InvertedIndex
# Build a small corpus and exercise the index end to end.
index = InvertedIndex()
documents = {
    'doc1': "Python is a great programming language",
    'doc2': "JavaScript is also popular for web",
    'doc3': "Machine learning with Python is powerful",
    'doc4': "Web development with JavaScript frameworks"
}

# Index every document in the corpus.
for doc_id in documents:
    index.add_document(doc_id, documents[doc_id])

# Single-term query.
print("Search 'Python':", index.search("Python"))

# Multi-term query spanning several documents.
print("Search 'web programming':", index.search("web programming"))
Testing Checklist
- Documents are indexed correctly
- Search returns relevant results
- TF-IDF ranking works
- Boolean queries work
Summary
You've built a complete search indexing system.
What You Built
- Inverted Index - Fast term lookup
- Tokenizer - Text processing
- TF-IDF Ranking - Relevance scoring
- Query Processor - Boolean queries