Build a Simple Search Engine
Introduction
Search engines power the modern web, from Google to the search box on your favorite website. In this tutorial, we'll build a simple but functional search engine from scratch.
We'll implement the key pieces: document indexing, tokenization, an inverted index, and TF-IDF relevance ranking. By the end, you'll have built:
- A document indexing system
- An inverted index for fast lookups
- TF-IDF ranking algorithm
- A simple web interface
- A REST API for searching
Along the way, you'll learn:
- How search engines work internally
- Inverted index data structures
- TF-IDF ranking algorithm
- Text preprocessing and tokenization
- Building REST APIs with Flask
Project Overview
Our search engine will consist of three main components:
1. Indexer
Processes documents and creates an inverted index - a mapping from words to the documents containing them.
2. Search Engine
Takes user queries, looks up matching documents, and ranks them by relevance.
3. Web Interface
A simple Flask API and frontend for users to search documents.
Prerequisites
- Python installed - Version 3.8 or higher
- Code editor - VS Code recommended
- Basic Python knowledge
- Understanding of data structures
Project Setup
```bash
# Create project directory
mkdir simple-search-engine
cd simple-search-engine

# Create virtual environment
python -m venv venv

# Activate virtual environment
# Windows:
venv\Scripts\activate
# Mac/Linux:
source venv/bin/activate

# Install dependencies
pip install flask nltk
```
Project Structure
```text
simple-search-engine/
├── search_engine/
│   ├── __init__.py
│   ├── indexer.py
│   ├── searcher.py
│   └── tokenizer.py
├── templates/
│   └── index.html
├── static/
│   └── styles.css
├── app.py
├── documents.json
└── requirements.txt
```
Flask's render_template looks for HTML files in templates/, so index.html lives there; static/ holds the stylesheet.
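The requirements.txt file isn't shown elsewhere in this tutorial; a minimal version can simply mirror the pip install command above (pin exact versions with pip freeze if you want reproducible installs):

```text
flask
nltk
```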
Building the Index
The inverted index is the heart of any search engine. It maps each word to the documents that contain it.
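The indexer below delegates text processing to a Tokenizer class in search_engine/tokenizer.py. The tutorial doesn't list that file, so here is a minimal sketch: it lowercases the text, splits on non-alphanumeric characters, and drops a small hard-coded set of English stop words (you could swap in NLTK's stopword list instead).

```python
# search_engine/tokenizer.py
# Minimal sketch -- one reasonable implementation of the tokenizer the
# indexer expects. Lowercases, splits on non-alphanumeric characters,
# and removes common English stop words.
import re

# Small hard-coded stop word list; nltk.corpus.stopwords (after
# nltk.download('stopwords')) gives broader coverage if you prefer.
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'with',
}


class Tokenizer:
    def tokenize(self, text):
        # Split on anything that is not a letter or digit
        words = re.findall(r'[a-z0-9]+', text.lower())
        return [w for w in words if w not in STOP_WORDS]
```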
```python
# search_engine/indexer.py
import json
import math
from collections import defaultdict, Counter

from .tokenizer import Tokenizer


class Indexer:
    def __init__(self):
        self.inverted_index = defaultdict(set)        # term -> set of doc_ids
        self.document_count = 0                       # total number of indexed documents
        self.doc_lengths = {}                         # doc_id -> number of tokens
        self.term_frequencies = defaultdict(Counter)  # doc_id -> {term: count}
        self.tokenizer = Tokenizer()

    def index_document(self, doc_id, content):
        tokens = self.tokenizer.tokenize(content)
        self.doc_lengths[doc_id] = len(tokens)
        for token in tokens:
            self.inverted_index[token].add(doc_id)
            self.term_frequencies[doc_id][token] += 1
        self.document_count += 1

    def index_documents(self, documents):
        for doc_id, content in documents.items():
            self.index_document(doc_id, content)

    def get_document_frequency(self, term):
        return len(self.inverted_index.get(term, set()))

    def get_inverse_document_frequency(self, term):
        df = self.get_document_frequency(term)
        if self.document_count == 0 or df == 0:
            return 0
        return math.log(self.document_count / df)

    def save_index(self, filename):
        index_data = {
            'inverted_index': {k: list(v) for k, v in self.inverted_index.items()},
            'document_count': self.document_count,
            'doc_lengths': self.doc_lengths,
            'term_frequencies': {k: dict(v) for k, v in self.term_frequencies.items()}
        }
        with open(filename, 'w') as f:
            json.dump(index_data, f)

    def load_index(self, filename):
        with open(filename, 'r') as f:
            index_data = json.load(f)
        self.inverted_index = defaultdict(set, {
            k: set(v) for k, v in index_data['inverted_index'].items()
        })
        self.document_count = index_data['document_count']
        self.doc_lengths = index_data['doc_lengths']
        self.term_frequencies = defaultdict(Counter, {
            k: Counter(v) for k, v in index_data['term_frequencies'].items()
        })
```
Instead of scanning every document for every query, we store a mapping from each word to the documents that contain it. Looking up a query term is then a single dictionary access (amortized O(1)), and only the documents that actually contain the term need to be scored.
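To make the structure concrete, here is a quick sanity check from a Python shell in the project root (the token counts assume the sketch tokenizer above, which drops stop words):

```python
from search_engine.indexer import Indexer

indexer = Indexer()
indexer.index_documents({
    'doc1': 'python is a programming language',
    'doc2': 'flask is a python web framework',
})

print(indexer.inverted_index['python'])                  # {'doc1', 'doc2'}
print(indexer.inverted_index['flask'])                   # {'doc2'}
print(indexer.get_inverse_document_frequency('flask'))   # log(2/1) ≈ 0.693
print(indexer.get_inverse_document_frequency('python'))  # log(2/2) = 0 (appears everywhere)

indexer.save_index('index.json')                         # persist the index for later reuse
```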
Search Algorithm
Now let's build the search functionality that uses the index to find relevant documents.
```python
# search_engine/searcher.py
from collections import Counter


class Searcher:
    def __init__(self, indexer):
        self.indexer = indexer

    def search(self, query, top_k=10):
        tokens = self.indexer.tokenizer.tokenize(query)
        if not tokens:
            return []
        doc_scores = Counter()
        for token in tokens:
            if token not in self.indexer.inverted_index:
                continue
            idf = self.indexer.get_inverse_document_frequency(token)
            for doc_id in self.indexer.inverted_index[token]:
                tf = self._calculate_tf(doc_id, token)
                doc_scores[doc_id] += tf * idf
        # Returns a list of (doc_id, score) tuples, highest score first
        return doc_scores.most_common(top_k)

    def _calculate_tf(self, doc_id, term):
        doc_length = self.indexer.doc_lengths.get(doc_id, 0)
        if doc_length == 0:
            return 0
        # Raw count of the term in this document, normalized by document length
        count = self.indexer.term_frequencies.get(doc_id, {}).get(term, 0)
        return count / doc_length
```
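A quick sanity check (again assuming the sketch tokenizer) shows the ranking in action on a toy index:

```python
from search_engine.indexer import Indexer
from search_engine.searcher import Searcher

indexer = Indexer()
indexer.index_documents({
    'doc1': 'python is a programming language',
    'doc2': 'flask is a python web framework',
})
searcher = Searcher(indexer)

# 'python' appears in both documents (IDF = 0), while 'flask' is unique to doc2,
# so doc2 ranks first: roughly [('doc2', 0.173), ('doc1', 0.0)]
print(searcher.search('python flask'))
```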
Ranking Results
The basic Searcher above already scores documents with TF-IDF (Term Frequency - Inverse Document Frequency). The AdvancedSearcher below refines it with length-normalized, cosine-style scoring plus a boost for documents that match a larger share of the query terms.
```python
# Improved ranking with cosine-style normalization
import math
from collections import Counter


class AdvancedSearcher:
    def __init__(self, indexer):
        self.indexer = indexer

    def search(self, query, top_k=10):
        tokens = self.indexer.tokenizer.tokenize(query)
        if not tokens:
            return []
        query_vector = self._build_query_vector(tokens)
        dot_products = {}  # doc_id -> query . document weights (over query terms)
        doc_norms = {}     # doc_id -> squared magnitude of the doc's weights (over query terms)
        for token, query_tf in query_vector.items():
            if token not in self.indexer.inverted_index:
                continue
            idf = self.indexer.get_inverse_document_frequency(token)
            query_weight = query_tf * idf
            for doc_id in self.indexer.inverted_index[token]:
                doc_weight = self._get_term_frequency(doc_id, token) * idf
                dot_products[doc_id] = dot_products.get(doc_id, 0.0) + query_weight * doc_weight
                doc_norms[doc_id] = doc_norms.get(doc_id, 0.0) + doc_weight ** 2
        scores = []
        for doc_id, dot in dot_products.items():
            norm = math.sqrt(doc_norms[doc_id])
            if norm == 0:
                continue
            # Normalize so long documents don't dominate, then boost documents
            # that cover a larger fraction of the query terms
            score = (dot / norm) * self._get_query_doc_overlap(tokens, doc_id)
            scores.append((doc_id, score))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def _get_term_frequency(self, doc_id, term):
        doc_length = self.indexer.doc_lengths.get(doc_id, 0)
        if doc_length == 0:
            return 0
        count = self.indexer.term_frequencies.get(doc_id, {}).get(term, 0)
        return count / doc_length

    def _build_query_vector(self, tokens):
        return Counter(tokens)

    def _get_query_doc_overlap(self, query_tokens, doc_id):
        index = self.indexer.inverted_index
        overlap = sum(1 for t in query_tokens if doc_id in index.get(t, set()))
        return overlap / len(query_tokens)
```
- TF (Term Frequency) - How often a term appears in a document, normalized here by the document's length
- IDF (Inverse Document Frequency) - How rare a term is across the whole collection; rare terms get a higher weight
- TF-IDF - The product of the two: high for terms that appear often in a document but rarely elsewhere (see the worked example below)
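To see the numbers in action, here is a small worked example with hypothetical counts:

```python
import math

# Hypothetical numbers: 'python' appears 5 times in a 100-token document,
# and in 10 of the 1,000 documents in the index.
tf = 5 / 100                # 0.05  (length-normalized term frequency)
idf = math.log(1000 / 10)   # ≈ 4.61 (rarer terms get a larger IDF)
print(tf * idf)             # ≈ 0.23 -> this term's contribution to the document's score
```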
API Endpoints
Let's create the Flask API to serve our search engine.
```python
# app.py
import json

from flask import Flask, request, jsonify, render_template

from search_engine.indexer import Indexer
from search_engine.searcher import Searcher

app = Flask(__name__)

documents = {}
indexer = Indexer()
searcher = Searcher(indexer)


def load_documents(filename='documents.json'):
    global documents
    with open(filename, 'r') as f:
        documents = json.load(f)
    # Each document is a dict with 'title' and 'content'; index both fields as text
    for doc_id, doc in documents.items():
        indexer.index_document(doc_id, f"{doc.get('title', '')} {doc.get('content', '')}")


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/api/search')
def search():
    query = request.args.get('q', '')
    top_k = int(request.args.get('top', 10))
    if not query:
        return jsonify({'error': 'Query is required'}), 400
    results = searcher.search(query, top_k)
    formatted_results = []
    for doc_id, score in results:
        formatted_results.append({
            'id': doc_id,
            'title': documents[doc_id].get('title', doc_id),
            'content': documents[doc_id].get('content', '')[:200],
            'score': round(score, 4)
        })
    return jsonify({
        'query': query,
        'results': formatted_results,
        'total': len(formatted_results)
    })


@app.route('/api/documents', methods=['GET'])
def get_documents():
    return jsonify(documents)


@app.route('/api/documents', methods=['POST'])
def add_document():
    data = request.json
    doc_id = data.get('id')
    if not doc_id or doc_id in documents:
        return jsonify({'error': 'Invalid or duplicate document ID'}), 400
    title = data.get('title', '')
    content = data.get('content', '')
    documents[doc_id] = {'title': title, 'content': content}
    indexer.index_document(doc_id, f"{title} {content}")
    return jsonify({'success': True, 'id': doc_id}), 201


if __name__ == '__main__':
    load_documents()
    app.run(debug=True, port=5000)
```
Building the Frontend
Create templates/index.html (served by Flask's render_template). It provides a search box and renders results returned by the API:

```html
<!-- templates/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Search Engine</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <div class="container">
        <h1>Search Engine</h1>
        <div class="search-box">
            <input type="text" id="searchInput" placeholder="Enter your search query...">
            <button onclick="performSearch()">Search</button>
        </div>
        <div id="results"></div>
    </div>
    <script>
        async function performSearch() {
            const query = document.getElementById('searchInput').value;
            if (!query) return;
            const response = await fetch(`/api/search?q=${encodeURIComponent(query)}`);
            const data = await response.json();
            displayResults(data.results);
        }

        function displayResults(results) {
            const container = document.getElementById('results');
            if (results.length === 0) {
                container.innerHTML = '<p>No results found.</p>';
                return;
            }
            container.innerHTML = results.map(result => `
                <div class="result-card">
                    <h3>${result.title}</h3>
                    <p>${result.content}...</p>
                    <span class="score">Score: ${result.score}</span>
                </div>
            `).join('');
        }

        document.getElementById('searchInput').addEventListener('keypress', (e) => {
            if (e.key === 'Enter') performSearch();
        });
    </script>
</body>
</html>
```
Testing
Create a documents.json file with a few sample documents:

```json
{
  "doc1": {
    "title": "Introduction to Python",
    "content": "Python is a high-level programming language known for its simplicity and readability. It supports multiple programming paradigms including procedural, object-oriented, and functional programming."
  },
  "doc2": {
    "title": "Web Development with Flask",
    "content": "Flask is a lightweight WSGI web application framework in Python. It is designed to make getting started quick and easy, with the ability to scale up to complex applications."
  },
  "doc3": {
    "title": "Machine Learning Basics",
    "content": "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience. It focuses on developing algorithms that can access data and use it to learn patterns."
  },
  "doc4": {
    "title": "Data Science with Python",
    "content": "Python has become the go-to language for data science. Libraries like pandas, numpy, and matplotlib make it powerful for data analysis and visualization."
  }
}
```
```bash
# Start the server
python app.py

# Test with curl
curl "http://localhost:5000/api/search?q=python"
curl "http://localhost:5000/api/search?q=machine%20learning"
```
- ✓ Documents load and index correctly
- ✓ Search returns relevant results
- ✓ Results are ranked by relevance
- ✓ API endpoints work correctly
- ✓ Frontend displays results
Summary
Congratulations! You've built a complete search engine from scratch.
What You Built
- Inverted Index - Fast document lookup
- TF-IDF Ranking - Relevance scoring
- Flask API - REST endpoints
- Web Interface - User-friendly search
Next Steps
- Add stemming and lemmatization (see the sketch below)
- Implement phrase search
- Add fuzzy matching for typo tolerance
- Support boolean operators (AND, OR, NOT)
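For example, stemming can be added by wrapping the tokenizer with NLTK's PorterStemmer. This sketch is not wired into the code above; treat it as a starting point:

```python
# A sketch: stemming tokens with NLTK's PorterStemmer so that morphological
# variants ('running', 'runs' -> 'run') collapse to the same index term.
import re
from nltk.stem import PorterStemmer


class StemmingTokenizer:
    def __init__(self):
        self.stemmer = PorterStemmer()

    def tokenize(self, text):
        words = re.findall(r'[a-z0-9]+', text.lower())
        return [self.stemmer.stem(w) for w in words]
```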