Build a Simple Version Control System
Introduction
Version control systems are essential tools for software development. They track changes to files over time, enable collaboration, and provide a history of project evolution. In this tutorial, we'll build a simple version control system similar to a basic Git implementation.
Understanding how version control works internally will deepen your understanding of software development workflows and make you more effective at using tools like Git.
What we'll work toward:
- A content-addressable object store
- A commit history system
- Branching and merging
- Checkout functionality
- Diff computation
What you'll learn:
- How version control systems work
- Content-addressable storage
- Directed acyclic graphs (DAGs)
- Merge algorithms
Core Concepts
Content-Addressable Storage
Each object in the system is identified by a hash of its content. This ensures data integrity and enables deduplication. The same content always produces the same address.
Commits
A commit represents a snapshot of the repository at a specific point in time. It contains a tree of files, a message, author information, and a reference to parent commits.
Branches
Branches are simply named references to commits. Creating a branch is as simple as creating a file that points to a commit.
The Directed Acyclic Graph
Commits form a DAG where each commit points to its parents. This structure enables branching, merging, and history traversal.
Project Overview
Our version control system will include:
| Feature | Description |
|---|---|
| Object Store | Store blobs, trees, commits |
| Init | Initialize repository |
| Add/Remove | Stage file changes |
| Commit | Save snapshot |
| Log | View history |
Prerequisites
- Python 3.8+ - Installed on your system
- Basic Python knowledge - Classes, file I/O
Object Model
Create vcs/objects.py:
import os
import hashlib
import struct
from dataclasses import dataclass
from typing import List, Optional
def sha1(data: bytes) -> str:
return hashlib.sha1(data).hexdigest()
class Blob:
    """Raw file contents, addressed by the SHA-1 of those bytes."""

    def __init__(self, content: bytes = b''):
        self.content = content
        # The address is derived from the content once, at construction.
        self.hash = sha1(self.content)

    @classmethod
    def from_file(cls, path: str):
        """Build a blob from the bytes of the file at ``path``."""
        with open(path, 'rb') as source:
            raw = source.read()
        return cls(raw)

    def serialize(self) -> bytes:
        """Return the bytes to persist; a blob is stored verbatim."""
        return self.content

    @classmethod
    def deserialize(cls, data: bytes):
        """Rebuild a blob from bytes produced by :meth:`serialize`."""
        return cls(content=data)
class Tree:
    """A snapshot listing: ``(name, mode, obj_hash)`` entries.

    ``hash`` is the SHA-1 of the serialized entries. It is computed at
    construction and refreshed by :meth:`add`; mutating ``entries`` directly
    does not update it.
    """

    # Length in bytes of a raw (binary) SHA-1 digest.
    _HASH_SIZE = 20

    def __init__(self, entries: Optional[List[tuple]] = None):
        # Copy so that later add() calls never mutate the caller's list.
        self.entries = list(entries) if entries else []
        self.hash = self._compute_hash()

    def add(self, name: str, mode: str, obj_hash: str):
        """Append an entry and refresh ``hash``.

        ``obj_hash`` must be a hexadecimal string; it is stored in binary
        form by :meth:`serialize`.
        """
        self.entries.append((name, mode, obj_hash))
        self.hash = self._compute_hash()

    def _compute_hash(self) -> str:
        """Return the SHA-1 of the current serialized form."""
        return sha1(self.serialize())

    def serialize(self) -> bytes:
        """Encode entries as ``"<mode> <name>\\0" + <raw hash bytes>``.

        Entries are emitted in sorted order so that the same set of entries
        always produces the same bytes (and therefore the same hash).
        """
        return b''.join(
            f"{mode} {name}\0".encode() + bytes.fromhex(obj_hash)
            for name, mode, obj_hash in sorted(self.entries)
        )

    @classmethod
    def deserialize(cls, data: bytes):
        """Rebuild a tree from bytes produced by :meth:`serialize`.

        Raises:
            ValueError: if an entry header is not NUL-terminated, lacks the
                space between mode and name, or its hash is truncated.
        """
        entries = []
        pos = 0
        while pos < len(data):
            null_pos = data.find(b'\0', pos)
            if null_pos == -1:
                # Without this check find()'s -1 would be used as an index
                # and the loop would read garbage.
                raise ValueError(
                    "malformed tree: entry header is not NUL-terminated")
            mode, sep, name = data[pos:null_pos].decode().partition(' ')
            if not sep:
                raise ValueError(
                    "malformed tree: entry header lacks a mode/name separator")
            hash_end = null_pos + 1 + cls._HASH_SIZE
            raw_hash = data[null_pos + 1:hash_end]
            if len(raw_hash) != cls._HASH_SIZE:
                raise ValueError("malformed tree: truncated object hash")
            entries.append((name, mode, raw_hash.hex()))
            pos = hash_end
        return cls(entries)
@dataclass
class Commit:
    """A snapshot record: a tree plus parents, author, message and timestamp.

    The serialized form is a header of ``key value`` lines (``tree``, zero or
    more ``parent``, ``author``, ``timestamp``), a blank line, then the
    message. ``hash`` is the SHA-1 of that serialized form.
    """

    tree_hash: str
    parent_hashes: List[str]
    author: str
    message: str
    timestamp: int

    def serialize(self) -> bytes:
        """Return the encoded text form of this commit."""
        lines = [f"tree {self.tree_hash}"]
        lines.extend(f"parent {parent}" for parent in self.parent_hashes)
        lines.append(f"author {self.author}")
        lines.append(f"timestamp {self.timestamp}")
        header = "\n".join(lines)
        return f"{header}\n\n{self.message}".encode()

    @classmethod
    def deserialize(cls, data: bytes) -> "Commit":
        """Rebuild a commit from bytes produced by :meth:`serialize`.

        The header ends at the first blank line, so an ``author`` value that
        itself contains a newline does not round-trip.

        Raises:
            ValueError: if the blank line separating header and message is
                missing, the ``tree`` line is absent, or the timestamp is not
                an integer.
        """
        header, sep, message = data.decode().partition("\n\n")
        if not sep:
            raise ValueError(
                "malformed commit: missing blank line after header")
        tree_hash = None
        parent_hashes = []
        author = ''
        timestamp = 0
        for line in header.split("\n"):
            key, _, value = line.partition(' ')
            if key == 'tree':
                tree_hash = value
            elif key == 'parent':
                parent_hashes.append(value)
            elif key == 'author':
                author = value
            elif key == 'timestamp':
                timestamp = int(value)
        if tree_hash is None:
            raise ValueError("malformed commit: missing tree line")
        return cls(
            tree_hash=tree_hash,
            parent_hashes=parent_hashes,
            author=author,
            message=message,
            timestamp=timestamp,
        )

    @property
    def hash(self) -> str:
        """SHA-1 of the serialized commit, recomputed on every access."""
        return sha1(self.serialize())
Repository
Create vcs/repository.py:
import os
import shutil
import time
from pathlib import Path
from typing import Optional

from .objects import Blob, Tree, Commit, sha1
class Repository:
    """On-disk repository rooted at ``path``; all state lives under ``.vcs``.

    Layout:
        .vcs/objects/<aa>/<rest>  stored objects, addressed by ``obj.hash``
        .vcs/refs/...             named references (branches)
        .vcs/HEAD                 ``ref: refs/heads/<branch>`` or a raw hash
        .vcs/index                staging file (created empty, unused here)

    Each object file is ``<type>\\0<payload>`` where ``<type>`` is ``blob``,
    ``tree`` or ``commit`` and ``<payload>`` is ``obj.serialize()``. The type
    tag is what lets :meth:`get_object` pick the right decoder.

    NOTE: the address is the hash of the payload only, so two objects of
    different types with identical payloads (e.g. an empty blob and an empty
    tree) share one file; the first one stored wins.
    """

    def __init__(self, path: str):
        self.path = Path(path)
        self.objects_dir = self.path / '.vcs' / 'objects'
        self.refs_dir = self.path / '.vcs' / 'refs'
        self.head = self.path / '.vcs' / 'HEAD'
        self.index = self.path / '.vcs' / 'index'

    def init(self):
        """Create the ``.vcs`` layout and point HEAD at ``refs/heads/main``."""
        self.objects_dir.mkdir(parents=True, exist_ok=True)
        self.refs_dir.mkdir(parents=True, exist_ok=True)
        self.index.write_text('')
        self.head.write_text('ref: refs/heads/main\n')

    # ------------------------------------------------------------------
    # Object store
    # ------------------------------------------------------------------

    def _object_path(self, obj_hash: str) -> Path:
        """Return the file path for ``obj_hash`` (two-character fan-out)."""
        return self.objects_dir / obj_hash[:2] / obj_hash[2:]

    @staticmethod
    def _type_tag(obj) -> bytes:
        """Return the on-disk type tag for ``obj``.

        Raises:
            TypeError: if ``obj`` is not a Blob, Tree or Commit.
        """
        if isinstance(obj, Blob):
            return b'blob'
        if isinstance(obj, Tree):
            return b'tree'
        if isinstance(obj, Commit):
            return b'commit'
        raise TypeError(f"cannot store object of type {type(obj).__name__}")

    @staticmethod
    def _parse_commit(payload: bytes) -> "Commit":
        """Decode the text form written by ``Commit.serialize``.

        The header is a run of ``key value`` lines terminated by the first
        blank line; everything after that blank line is the message.

        Raises:
            ValueError: if the blank line or the ``tree`` line is missing.
        """
        header, sep, message = payload.decode().partition("\n\n")
        if not sep:
            raise ValueError(
                "malformed commit: missing blank line after header")
        tree_hash = None
        parent_hashes = []
        author = ''
        timestamp = 0
        for line in header.split("\n"):
            key, _, value = line.partition(' ')
            if key == 'tree':
                tree_hash = value
            elif key == 'parent':
                parent_hashes.append(value)
            elif key == 'author':
                author = value
            elif key == 'timestamp':
                timestamp = int(value)
        if tree_hash is None:
            raise ValueError("malformed commit: missing tree line")
        return Commit(
            tree_hash=tree_hash,
            parent_hashes=parent_hashes,
            author=author,
            message=message,
            timestamp=timestamp,
        )

    def store_object(self, obj) -> str:
        """Persist ``obj`` (if not already present) and return its hash."""
        obj_hash = obj.hash
        obj_path = self._object_path(obj_hash)
        obj_path.parent.mkdir(parents=True, exist_ok=True)
        if not obj_path.exists():
            obj_path.write_bytes(
                self._type_tag(obj) + b'\0' + obj.serialize())
        return obj_hash

    def get_object(self, obj_hash: str):
        """Load the object stored under ``obj_hash``.

        Returns:
            A Blob, Tree or Commit, or ``None`` when no such object file
            exists or its type tag is missing or unknown.
        """
        obj_path = self._object_path(obj_hash)
        # is_file() rather than exists(): an empty or very short hash would
        # otherwise resolve to a directory inside the object store.
        if not obj_path.is_file():
            return None
        tag, sep, payload = obj_path.read_bytes().partition(b'\0')
        if not sep:
            return None
        if tag == b'blob':
            return Blob.deserialize(payload)
        if tag == b'tree':
            return Tree.deserialize(payload)
        if tag == b'commit':
            return self._parse_commit(payload)
        return None

    # ------------------------------------------------------------------
    # References
    # ------------------------------------------------------------------

    def write_ref(self, name: str, hash: str):
        """Write ``hash`` to the ref ``name`` (a path relative to ``refs``)."""
        ref_path = self.refs_dir / name
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        ref_path.write_text(hash + '\n')

    def read_ref(self, name: str) -> Optional[str]:
        """Return the hash stored in ref ``name``, or ``None`` if absent."""
        ref_path = self.refs_dir / name
        if ref_path.exists():
            return ref_path.read_text().strip()
        return None

    def _head_ref_name(self) -> Optional[str]:
        """Return the ref HEAD points at, relative to ``refs``.

        Returns ``None`` when HEAD holds a raw hash instead of ``ref: ...``.
        """
        content = self.head.read_text().strip()
        if not content.startswith('ref: '):
            return None
        ref = content[5:]
        # HEAD stores the ref relative to .vcs ("refs/heads/main") while
        # read_ref/write_ref take names relative to .vcs/refs, so drop the
        # leading component to avoid looking in .vcs/refs/refs/...
        if ref.startswith('refs/'):
            ref = ref[len('refs/'):]
        return ref

    def get_head_hash(self) -> Optional[str]:
        """Return the commit hash HEAD resolves to, or ``None`` if unset."""
        ref_name = self._head_ref_name()
        if ref_name is not None:
            return self.read_ref(ref_name)
        return self.head.read_text().strip() or None

    def update_head(self, hash: str):
        """Advance HEAD to ``hash``.

        When HEAD is symbolic the branch it names is updated and HEAD itself
        is left untouched, so the branch keeps tracking new commits. When
        HEAD holds a raw hash, HEAD is overwritten.
        """
        ref_name = self._head_ref_name()
        if ref_name is not None:
            self.write_ref(ref_name, hash)
        else:
            self.head.write_text(hash + '\n')

    # ------------------------------------------------------------------
    # Commits
    # ------------------------------------------------------------------

    def create_commit(self, message: str, author: str = 'user') -> str:
        """Snapshot the working directory and record it as a new commit.

        The current HEAD commit, if any, becomes the single parent.

        Returns:
            The hash of the new commit.
        """
        tree_hash = self.store_object(self._build_tree())
        parent_hash = self.get_head_hash()
        commit = Commit(
            tree_hash=tree_hash,
            parent_hashes=[parent_hash] if parent_hash else [],
            author=author,
            message=message,
            # Seconds since the Unix epoch (wall-clock time).
            timestamp=int(time.time()),
        )
        commit_hash = self.store_object(commit)
        self.update_head(commit_hash)
        return commit_hash

    def _build_tree(self) -> "Tree":
        """Store every working-directory file as a blob; return a flat tree.

        Entry names are paths relative to the repository root, written with
        forward slashes. Directories named ``.vcs`` are not descended into,
        so the repository's own metadata is never snapshotted.
        """
        entries = []
        for root, dirs, files in os.walk(self.path):
            # Prune in place: os.walk only skips directories removed from
            # the list it handed us.
            dirs[:] = sorted(d for d in dirs if d != '.vcs')
            for file_name in sorted(files):
                file_path = Path(root) / file_name
                blob_hash = self.store_object(Blob.from_file(str(file_path)))
                rel_path = file_path.relative_to(self.path).as_posix()
                entries.append((rel_path, '100644', blob_hash))
        return Tree(entries)

    def get_commit(self, hash: str) -> Optional["Commit"]:
        """Return the commit stored under ``hash``, or ``None``."""
        obj = self.get_object(hash)
        return obj if isinstance(obj, Commit) else None

    def get_tree(self, tree_hash: str) -> Optional["Tree"]:
        """Return the tree stored under ``tree_hash``, or ``None``."""
        obj = self.get_object(tree_hash)
        return obj if isinstance(obj, Tree) else None
Commands
Create vcs/cli.py:
import os
import sys
from pathlib import Path
from .repository import Repository
class VCS:
    """Command layer: each public method implements one ``vcs`` subcommand.

    Every command prints its result to stdout and returns ``None``.
    """

    def __init__(self, path: str = '.'):
        self.repo = Repository(path)

    def _is_initialized(self) -> bool:
        """Return True when a ``.vcs`` directory exists at the repo path."""
        return (self.repo.path / '.vcs').exists()

    def _current_branch(self):
        """Return the branch named by HEAD, or ``None`` if HEAD is a hash."""
        prefix = 'ref: refs/heads/'
        content = self.repo.head.read_text().strip()
        if content.startswith(prefix):
            return content[len(prefix):]
        return None

    def init(self):
        """Create a repository unless one already exists."""
        if self._is_initialized():
            print("Already initialized")
            return
        self.repo.init()
        print("Initialized empty repository")

    def commit(self, message: str):
        """Snapshot the working directory and print the short commit hash."""
        if not self._is_initialized():
            print("Not initialized. Run 'vcs init' first.")
            return
        commit_hash = self.repo.create_commit(message)
        print(f"Created commit: {commit_hash[:8]}")

    def log(self):
        """Print history from HEAD, following each commit's first parent."""
        if not self._is_initialized():
            print("Not initialized")
            return
        commit_hash = self.repo.get_head_hash()
        while commit_hash:
            commit = self.repo.get_commit(commit_hash)
            if not commit:
                # Unknown or unreadable object: stop rather than crash.
                break
            print(f"commit {commit_hash}")
            print(f"Author: {commit.author}")
            print(f"\n {commit.message}\n")
            parents = commit.parent_hashes
            commit_hash = parents[0] if parents else None

    def status(self):
        """Print the current branch and whether any commit exists yet."""
        if not self._is_initialized():
            print("Not initialized")
            return
        branch = self._current_branch()
        print(f"On branch {branch}" if branch else "HEAD detached")
        head_hash = self.repo.get_head_hash()
        if head_hash:
            print(f"\nLatest commit: {head_hash[:8]}")
        else:
            print("\nNo commits yet")

    def diff(self, commit1: str = None, commit2: str = None):
        """Placeholder: diffing is not implemented, so nothing is compared."""
        if not self._is_initialized():
            print("Not initialized")
            return
        print("No changes to show")
def main():
    """Run the subcommand named by ``sys.argv[1]`` in the current directory.

    Supported subcommands: ``init``, ``commit <message>``, ``log``,
    ``status`` and ``diff``. Usage errors and unknown commands are reported
    on stdout; the function always returns ``None``.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: vcs <command>")
        return
    cmd = args[0]
    if cmd == 'commit' and len(args) < 2:
        print("Usage: vcs commit <message>")
        return

    vcs = VCS()
    handlers = {
        'init': vcs.init,
        'commit': lambda: vcs.commit(args[1]),
        'log': vcs.log,
        'status': vcs.status,
        'diff': vcs.diff,
    }
    handler = handlers.get(cmd)
    if handler is None:
        print(f"Unknown command: {cmd}")
        return
    handler()


if __name__ == '__main__':
    main()
Testing the Version Control System
# Run these from the directory that contains the vcs/ package.
# The entry point lives in vcs/cli.py, so the module to run is vcs.cli.
# Initialize repository
python -m vcs.cli init
# Create some files
echo "Hello World" > file1.txt
echo "Second file" > file2.txt
# Commit changes
python -m vcs.cli commit "Initial commit"
# View history
python -m vcs.cli log
# Make more changes
echo "Updated content" >> file1.txt
# Commit again
python -m vcs.cli commit "Add more content"
Summary
Congratulations! You've built a simple version control system. Here's what you learned:
- Object Model - Blobs, trees, and commits
- Content Addressing - SHA-1 based identification
- Repository - How to store and retrieve objects
- Commit History - Building and traversing history
Possible Extensions
- Add branching and merging
- Implement checkout
- Add diff computation
- Implement tags