← Back to Tutorials
Python

Build a Simple Version Control System

Difficulty: Advanced | Est. Time: ~5 hours

Introduction

Version control systems are essential tools for software development. They track changes to files over time, enable collaboration, and provide a history of project evolution. In this tutorial, we'll build a simple version control system similar to a basic Git implementation.

Understanding how version control works internally will deepen your understanding of software development workflows and make you more effective at using tools like Git.

What You'll Build
  • A content-addressable object store
  • A commit history system
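  • Branching and merging (left as an extension at the end of this tutorial)
  • Checkout functionality (left as an extension at the end of this tutorial)
  • Diff computation (a placeholder command here; the full version is left as an extension)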
What You'll Learn
  • How version control systems work
  • Content-addressable storage
  • Directed acyclic graphs (DAGs)
  • Merge algorithms

Core Concepts

Content-Addressable Storage

Each object in the system is identified by a hash of its content. This ensures data integrity and enables deduplication. The same content always produces the same address.

Commits

A commit represents a snapshot of the repository at a specific point in time. It contains a tree of files, a message, author information, and a reference to parent commits.

Branches

Branches are simply named references to commits. Creating a branch is as simple as creating a file that points to a commit.

The Directed Acyclic Graph

Commits form a DAG where each commit points to its parents. This structure enables branching, merging, and history traversal.

Project Overview

Our version control system will include:

Feature | Description
Object Store | Store blobs, trees, commits
Init | Initialize repository
Add/Remove | Stage file changes
Commit | Save snapshot
Log | View history

Prerequisites

  • Python 3.8+ - Installed on your system
  • Basic Python knowledge - Classes, file I/O

Object Model

Create vcs/objects.py:

import os
import hashlib
import struct
from dataclasses import dataclass
from typing import List, Optional


def sha1(data: bytes) -> str:
    """Return the SHA-1 digest of ``data`` as a hexadecimal string."""
    hasher = hashlib.sha1()
    hasher.update(data)
    return hasher.hexdigest()


class Blob:
    """File content held as raw bytes and identified by the SHA-1 of those bytes."""

    def __init__(self, content: bytes = b''):
        self.content = content
        # The object's address is derived purely from its content.
        self.hash = sha1(content)

    @classmethod
    def from_file(cls, path: str):
        """Build a blob from the bytes of the file at ``path``."""
        with open(path, 'rb') as handle:
            payload = handle.read()
        return cls(payload)

    def serialize(self) -> bytes:
        """Return the bytes to persist for this blob (the content itself)."""
        return self.content

    @classmethod
    def deserialize(cls, data: bytes):
        """Rebuild a blob from bytes previously produced by ``serialize``."""
        return cls(data)


class Tree:
    """A snapshot listing: a collection of ``(name, mode, obj_hash)`` entries.

    ``obj_hash`` is a hexadecimal digest string; the binary format stores it
    as raw bytes and reads it back as exactly 20 bytes (a SHA-1 digest).
    ``hash`` is the SHA-1 of the serialized entries and is refreshed by
    ``add``.
    """

    # Number of raw digest bytes that follow each entry header.
    _HASH_BYTES = 20

    def __init__(self, entries: Optional[List[tuple]] = None):
        # Copy the input so add() never mutates a list the caller still owns
        # (which would also leave the cached hash stale).
        self.entries = list(entries) if entries else []
        self.hash = self._compute_hash()

    def add(self, name: str, mode: str, obj_hash: str):
        """Append an entry and recompute the tree hash."""
        self.entries.append((name, mode, obj_hash))
        self.hash = self._compute_hash()

    def _compute_hash(self) -> str:
        """Return the SHA-1 of the current serialized form."""
        return sha1(self.serialize())

    def serialize(self) -> bytes:
        """Encode the entries, in sorted order, as ``"<mode> <name>" NUL <raw hash>``.

        Sorting makes the output (and therefore the hash) independent of the
        order in which entries were added.
        """
        return b''.join(
            f"{mode} {name}\0".encode() + bytes.fromhex(obj_hash)
            for name, mode, obj_hash in sorted(self.entries)
        )

    @classmethod
    def deserialize(cls, data: bytes):
        """Rebuild a tree from bytes produced by ``serialize``.

        Raises:
            ValueError: if an entry header has no NUL terminator, has no
                space between mode and name, or is followed by fewer than
                20 hash bytes.
        """
        entries = []
        pos = 0
        while pos < len(data):
            null_pos = data.find(b'\0', pos)
            if null_pos == -1:
                # Without this check a -1 result would be used as a slice
                # index and silently produce a bogus entry.
                raise ValueError("malformed tree: entry header is not NUL-terminated")

            hash_start = null_pos + 1
            hash_end = hash_start + cls._HASH_BYTES
            if hash_end > len(data):
                raise ValueError("malformed tree: truncated object hash")

            header = data[pos:null_pos].decode()
            mode, sep, name = header.partition(' ')
            if not sep:
                raise ValueError("malformed tree: entry header lacks a mode/name separator")

            entries.append((name, mode, data[hash_start:hash_end].hex()))
            pos = hash_end
        return cls(entries)


@dataclass
class Commit:
    """A snapshot record: a tree hash plus parents, author, message and timestamp.

    The serialized form is a block of ``key value`` header lines (``tree``,
    zero or more ``parent``, ``author``, ``timestamp``), a blank line, then
    the message. ``author`` must not contain newlines or the headers cannot
    be parsed back.
    """

    tree_hash: str
    parent_hashes: List[str]
    author: str
    message: str
    timestamp: int

    def serialize(self) -> bytes:
        """Encode the commit as UTF-8 text: header lines, a blank line, the message."""
        lines = [f"tree {self.tree_hash}"]
        lines.extend(f"parent {parent}" for parent in self.parent_hashes)
        lines.append(f"author {self.author}")
        lines.append(f"timestamp {self.timestamp}")
        return ("\n".join(lines) + "\n\n" + self.message).encode()

    @classmethod
    def deserialize(cls, data: bytes):
        """Rebuild a commit from bytes produced by ``serialize``.

        Raises:
            ValueError: if the blank line separating headers from the message
                is missing, a required header is absent, or the timestamp is
                not an integer.
        """
        # Only the first blank line separates headers from the message, so
        # messages that themselves contain blank lines survive intact.
        header, sep, message = data.decode().partition("\n\n")
        if not sep:
            raise ValueError("malformed commit: missing blank line before message")

        fields = {}
        parents = []
        for line in header.split("\n"):
            key, _, value = line.partition(" ")
            if key == "parent":
                parents.append(value)
            else:
                fields[key] = value

        try:
            return cls(
                tree_hash=fields["tree"],
                parent_hashes=parents,
                author=fields["author"],
                message=message,
                timestamp=int(fields["timestamp"]),
            )
        except KeyError as err:
            raise ValueError(f"malformed commit: missing {err.args[0]} header") from err

    @property
    def hash(self) -> str:
        """SHA-1 of the serialized commit, recomputed on each access."""
        return sha1(self.serialize())

Repository

Create vcs/repository.py:

import os
import shutil
import time
from pathlib import Path
from typing import Optional

from .objects import Blob, Tree, Commit, sha1


class Repository:
    """Object store, refs and HEAD for a working directory rooted at ``path``.

    All metadata lives under ``<path>/.vcs``:

    * ``objects/<first two hash chars>/<remaining chars>`` - stored objects,
      each written as a type tag, a NUL byte, then the serialized payload.
    * ``refs/`` - one file per ref, each containing a commit hash.
    * ``HEAD`` - either ``ref: <ref name>`` or a bare commit hash.
    * ``index`` - created empty by ``init``; not otherwise used by this class.
    """

    def __init__(self, path: str):
        self.path = Path(path)
        vcs_dir = self.path / '.vcs'
        self.objects_dir = vcs_dir / 'objects'
        self.refs_dir = vcs_dir / 'refs'
        self.head = vcs_dir / 'HEAD'
        self.index = vcs_dir / 'index'

    def init(self):
        """Create the ``.vcs`` layout, leaving any existing HEAD and index untouched."""
        self.objects_dir.mkdir(parents=True, exist_ok=True)
        self.refs_dir.mkdir(parents=True, exist_ok=True)
        # Only create these files when absent so that calling init() on an
        # existing repository cannot reset HEAD or wipe the index.
        if not self.index.exists():
            self.index.write_text('')
        if not self.head.exists():
            self.head.write_text('ref: refs/heads/main\n')

    def _object_path(self, obj_hash: str) -> Path:
        """Return the on-disk location for the object with the given hash."""
        return self.objects_dir / obj_hash[:2] / obj_hash[2:]

    def store_object(self, obj) -> str:
        """Persist ``obj`` (anything with ``hash`` and ``serialize()``) and return its hash.

        The file starts with the lower-cased class name of ``obj`` followed by
        a NUL byte so that ``get_object`` can tell which type to rebuild.
        An object whose file already exists is not rewritten.
        """
        obj_hash = obj.hash
        obj_path = self._object_path(obj_hash)

        if not obj_path.exists():
            obj_path.parent.mkdir(parents=True, exist_ok=True)
            type_tag = type(obj).__name__.lower().encode()
            obj_path.write_bytes(type_tag + b'\0' + obj.serialize())
        return obj_hash

    def get_object(self, obj_hash: str):
        """Load the object stored under ``obj_hash``.

        Returns a ``Blob``, ``Tree`` or ``Commit``, or ``None`` when no such
        file exists or its type tag is missing or unrecognised.
        """
        obj_path = self._object_path(obj_hash)
        if not obj_path.is_file():
            return None

        type_tag, sep, payload = obj_path.read_bytes().partition(b'\0')
        if not sep:
            return None

        if type_tag == b'blob':
            return Blob.deserialize(payload)
        if type_tag == b'tree':
            return Tree.deserialize(payload)
        if type_tag == b'commit':
            return self._parse_commit(payload)
        return None

    @staticmethod
    def _parse_commit(payload: bytes) -> "Commit":
        """Rebuild a ``Commit`` from the text layout written by ``Commit.serialize``.

        That layout is ``key value`` header lines (``tree``, ``parent``,
        ``author``, ``timestamp``), a blank line, then the message.
        """
        header, _, message = payload.decode().partition('\n\n')
        fields = {}
        parents = []
        for line in header.split('\n'):
            key, _, value = line.partition(' ')
            if key == 'parent':
                parents.append(value)
            else:
                fields[key] = value
        return Commit(
            tree_hash=fields['tree'],
            parent_hashes=parents,
            author=fields['author'],
            message=message,
            timestamp=int(fields['timestamp']),
        )

    def _ref_path(self, name: str) -> Path:
        """Map a ref name to its file, accepting ``heads/x`` and ``refs/heads/x`` alike.

        HEAD stores names with a leading ``refs/`` while ``refs_dir`` already
        points at ``.vcs/refs``; stripping the prefix keeps both spellings
        pointing at the same file.
        """
        if name.startswith('refs/'):
            name = name[len('refs/'):]
        return self.refs_dir / name

    def write_ref(self, name: str, hash: str):
        """Point the ref ``name`` at ``hash``, creating parent directories as needed."""
        ref_path = self._ref_path(name)
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        ref_path.write_text(hash + '\n')

    def read_ref(self, name: str) -> Optional[str]:
        """Return the hash stored in ref ``name``, or ``None`` if the ref does not exist."""
        ref_path = self._ref_path(name)
        if ref_path.is_file():
            return ref_path.read_text().strip()
        return None

    def get_head_hash(self) -> Optional[str]:
        """Resolve HEAD to a commit hash, or ``None`` when it points at nothing yet."""
        head_content = self.head.read_text().strip()
        if head_content.startswith('ref: '):
            return self.read_ref(head_content[5:])
        return head_content or None

    def update_head(self, hash: str):
        """Advance whatever HEAD designates to ``hash``.

        When HEAD is symbolic (``ref: ...``) the referenced ref is updated and
        HEAD itself is left alone, so the repository stays on its branch.
        Otherwise the hash is written directly into HEAD.
        """
        head_content = self.head.read_text().strip() if self.head.exists() else ''
        if head_content.startswith('ref: '):
            self.write_ref(head_content[5:], hash)
        else:
            self.head.write_text(hash + '\n')

    def create_commit(self, message: str, author: str = 'user') -> str:
        """Snapshot the working directory, record a commit, and move HEAD to it.

        Returns:
            The hash of the new commit.
        """
        tree = self._build_tree()
        tree_hash = self.store_object(tree)

        parent_hash = self.get_head_hash()
        commit = Commit(
            tree_hash=tree_hash,
            parent_hashes=[parent_hash] if parent_hash else [],
            author=author,
            message=message,
            # Wall-clock seconds since the epoch; os.times().elapsed counts
            # from an arbitrary reference point and is not a usable date.
            timestamp=int(time.time()),
        )

        commit_hash = self.store_object(commit)
        self.update_head(commit_hash)
        return commit_hash

    def _build_tree(self) -> "Tree":
        """Store every working-directory file as a blob and return the resulting tree.

        Entry names are paths relative to the repository root, written with
        forward slashes. All entries use mode ``100644``.
        """
        entries = []

        for root, dirs, files in os.walk(self.path):
            # Prune in place so os.walk never descends into the metadata
            # directory; otherwise each commit would snapshot the object
            # store it is writing to.
            dirs[:] = sorted(d for d in dirs if d != '.vcs')

            for f in sorted(files):
                # Files whose name starts with '.vcs' are left out of snapshots.
                if f.startswith('.vcs'):
                    continue

                file_path = Path(root) / f
                blob_hash = self.store_object(Blob.from_file(str(file_path)))
                rel_path = file_path.relative_to(self.path).as_posix()
                entries.append((rel_path, '100644', blob_hash))

        return Tree(entries)

    def get_commit(self, hash: str):
        """Return the stored object for ``hash`` (expected to be a commit), or ``None``."""
        return self.get_object(hash)

    def get_tree(self, tree_hash: str) -> "Optional[Tree]":
        """Return the stored object for ``tree_hash`` (expected to be a tree), or ``None``."""
        return self.get_object(tree_hash)

Commands

Create vcs/cli.py:

import os
import sys
from pathlib import Path
from .repository import Repository


class VCS:
    """Command handlers for the CLI; each one prints its result to stdout."""

    def __init__(self, path: str = '.'):
        self.repo = Repository(path)

    def _is_initialized(self) -> bool:
        """Return True when the repository's ``.vcs`` directory exists."""
        return (self.repo.path / '.vcs').exists()

    def init(self):
        """Create the repository metadata unless it already exists."""
        if self._is_initialized():
            print("Already initialized")
            return

        self.repo.init()
        print("Initialized empty repository")

    def commit(self, message: str):
        """Record a commit with ``message`` and print its abbreviated hash."""
        if not self._is_initialized():
            print("Not initialized. Run 'vcs init' first.")
            return

        commit_hash = self.repo.create_commit(message)
        print(f"Created commit: {commit_hash[:8]}")

    def log(self):
        """Print commits from HEAD backwards, following each commit's first parent."""
        if not self._is_initialized():
            print("Not initialized")
            return

        current = self.repo.get_head_hash()
        while current:
            commit = self.repo.get_commit(current)
            if not commit:
                break

            print(f"commit {current}")
            print(f"Author: {commit.author}")
            print(f"\n    {commit.message}\n")

            current = commit.parent_hashes[0] if commit.parent_hashes else None

    def status(self):
        """Print where HEAD points and whether any commit exists yet."""
        if not self._is_initialized():
            print("Not initialized")
            return

        # HEAD holds either "ref: refs/heads/<branch>" or a bare commit hash.
        head_content = self.repo.head.read_text().strip()
        branch_prefix = 'ref: refs/heads/'
        if head_content.startswith(branch_prefix):
            print(f"On branch {head_content[len(branch_prefix):]}")
        else:
            print(f"HEAD detached at {head_content[:8]}")

        head_hash = self.repo.get_head_hash()
        if head_hash:
            print(f"\nLatest commit: {head_hash[:8]}")
        else:
            print("\nNo commits yet")

    def diff(self, commit1: str = None, commit2: str = None):
        """Placeholder: the arguments are accepted but no comparison is performed."""
        if not self._is_initialized():
            print("Not initialized")
            return

        print("No changes to show")


def main(argv=None):
    """Parse the command line and run the requested command.

    Args:
        argv: Full argument list including the program name at index 0.
            Defaults to ``sys.argv`` when omitted.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("Usage: vcs {init|commit|log|status|diff}")
        return

    cmd = args[1]
    # Commands that take no further arguments map directly to VCS methods.
    no_arg_commands = ('init', 'log', 'status', 'diff')

    if cmd != 'commit' and cmd not in no_arg_commands:
        print(f"Unknown command: {cmd}")
        return
    if cmd == 'commit' and len(args) < 3:
        print("Usage: vcs commit MESSAGE")
        return

    vcs = VCS()
    if cmd == 'commit':
        vcs.commit(args[2])
    else:
        # Safe: cmd was checked against the fixed tuple above.
        getattr(vcs, cmd)()

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

Testing the Version Control System

# Run these from the directory that contains the vcs/ package.
# The entry point lives in vcs/cli.py, so the module to run is vcs.cli.

# Initialize repository
python -m vcs.cli init

# Create some files
echo "Hello World" > file1.txt
echo "Second file" > file2.txt

# Commit changes
python -m vcs.cli commit "Initial commit"

# View history
python -m vcs.cli log

# Make more changes
echo "Updated content" >> file1.txt

# Commit again
python -m vcs.cli commit "Add more content"

Summary

Congratulations! You've built a simple version control system. Here's what you learned:

  • Object Model - Blobs, trees, and commits
  • Content Addressing - SHA-1 based identification
  • Repository - How to store and retrieve objects
  • Commit History - Building and traversing history

Possible Extensions

  • Add branching and merging
  • Implement checkout
  • Add diff computation
  • Implement tags