Voxel File Format

TL;DR: I needed a smaller file format for my voxel models, and the resulting treasure chest model is small enough that I actually embedded it in the HTML -- 840 bytes for the top (lid?), and 924 bytes for the bottom, down from 59,089 and 60,371 bytes respectively. Or just 1.47% of the .vox file size!

Imagine my surprise when, even though there are so many image formats, I could only find the MagicaVoxel .vox file format as a sort-of standard file format for voxels. It's not like the two are very different -- images and voxels. One is a two-dimensional store of picture elements, pixels, whereas the other is a three-dimensional store of volumetric elements, voxels. Everything else about the two is the same.

So I decided to try my hand at my own format.

Now normally, I wouldn't hesitate to use the standard file format, especially when my desire was to allow modding support and custom models using the voxel file format. Insert the obligatory reference to the wonderful XKCD comic on standards. But the size of those files was just too much of a drawback for me. I mean, they are very efficient at storing "empty space", but if you had a 100x100x100 cube, completely filled with elements, then that file becomes huge. Every non-empty element is depicted by its material, as well as its x, y, and z position. That's a lot of meta-information that can be inferred.

The most naive solution of just storing the data in a long C-style array would have been better in that case. Nothing special would need to be done to parse the data (ignoring any endianness issues), just load it into a large buffer and you're good to go. Way easier on the CPU since no parsing has to be done, but terrible in terms of space and IO.

TODO: Specification.

Below I've attached some sample code to show how I've parsed the .ncvx file format.

Note: I made the stylistic decision of prefixing all member properties that can be safely uglified with a leading underscore _. It just gives the minifier an easier time shrinking the code even further. This is a decision I made, and not something that anyone else has to follow to use this code.

// model.ts

/**
 * In-memory representation of a decoded voxel model.
 *
 * Member names carry a leading underscore so that a minifier can safely
 * mangle them.
 */
export class ModelData {
    // Size of the voxel grid along each axis.
    _w: number;
    _h: number;
    _d: number;

    // Offset
    _x: number;
    _y: number;
    _z: number;

    // Material table: one three-component entry per material id.
    _matl: [number, number, number][];
    // One material id per voxel, laid out as described in _get().
    _voxl: number[];

    constructor() {
        this._w = 0;
        this._h = 0;
        this._d = 0;

        this._x = 0;
        this._y = 0;
        this._z = 0;

        this._matl = [];
        this._voxl = [];
    }

    /**
     * Looks up the material id stored for the voxel at (x, y, z).
     *
     * Voxels are stored x-fastest, then y, with the z slices in reverse
     * order (slice `_d - 1` comes first).
     *
     * @param x Column, expected in [0, _w).
     * @param y Row, expected in [0, _h).
     * @param z Slice, expected in [0, _d).
     * @returns The stored material id, or 0 when the coordinate lies outside
     *          the grid or no voxel data exists for that cell.
     */
    _get(x: number, y: number, z: number) {
        // Both bounds must be checked: a negative coordinate would otherwise
        // produce an index that aliases a different, valid voxel.
        const inside =
            x >= 0 && x < this._w &&
            y >= 0 && y < this._h &&
            z >= 0 && z < this._d;
        if (!inside) {
            return 0;
        }

        const id = this._voxl[x + this._w * (y + this._h * (this._d - z - 1))];
        // The voxel array can be shorter than w*h*d when the run-length data
        // ended early; report those cells as 0 instead of undefined.
        return id === undefined ? 0 : id;
    }
}
// parse.ts

import {ModelData} from "./model";

/**
 * Decodes a complete .ncvx buffer into a ModelData.
 *
 * The chunk table is located first, then each chunk is decoded in turn:
 * size, offset, materials and finally the voxels.
 *
 * @param buf Raw file contents.
 * @returns A freshly created model filled from the buffer.
 * @throws Whatever the individual chunk parsers throw for malformed input.
 */
export function parse(buf: ArrayBuffer) {
    const view = new DataView(buf);
    const offsets = parseSections(view);
    const result = new ModelData();

    parseSize(result, view, offsets._size);
    parseOffs(result, view, offsets._offs);
    parseMatl(result, view, offsets._matl);
    parseVoxl(result, view, offsets._voxl);

    return result;
}

// Four-character chunk tags, as they read when loaded as little-endian uint32.
const NCVX = 0x7876636e; // 'ncvx'
const SIZE = 0x657a6973; // 'size'
const OFFS = 0x7366666f; // 'offs'
const MATL = 0x6c74616d; // 'matl'
const VOXL = 0x6c786f76; // 'voxl'

// Format codes stored in the first payload word of the 'matl' and 'voxl' chunks.
const MATL_RGB_BYTE = 0x0102; // (1 << 8) | 2
const VOXL_4_12 = 1;
const VOXL_8_8 = 2;

/**
 * Byte offsets of the chunks located by parseSections.
 *
 * Each field holds the offset of the chunk's length word (the word directly
 * after its four-character tag), or 0 when the chunk was not found.
 */
interface Sections {
    _size: number; // 'size' chunk
    _offs: number; // 'offs' chunk
    _matl: number; // 'matl' chunk
    _voxl: number; // 'voxl' chunk
}

/**
 * Walks the chunk list and records where each known chunk lives.
 *
 * Layout as read here: word 0 must be the 'ncvx' tag, word 1 is skipped, and
 * from word 2 onward every chunk is a tag word, a length word (payload size
 * in 32-bit words) and the payload itself. Unknown tags are skipped.
 *
 * @param data View over the whole file.
 * @returns For each known chunk, the byte offset of its length word (0 if absent).
 * @throws Error when the buffer does not start with the 'ncvx' tag.
 */
function parseSections(data: DataView): Sections {
    if (data.getUint32(0, true) !== NCVX) {
        throw new Error("wrong chunk header code");
    }

    const found: Sections = { _size: 0, _offs: 0, _matl: 0, _voxl: 0 };

    // Cursor measured in 32-bit words; the first two words are the file header.
    let word = 2;
    while (4 * word < data.byteLength) {
        const tag = data.getUint32(4 * word, true);
        const lenPos = 4 * (word + 1);

        if (tag === SIZE) {
            found._size = lenPos;
        } else if (tag === OFFS) {
            found._offs = lenPos;
        } else if (tag === MATL) {
            found._matl = lenPos;
        } else if (tag === VOXL) {
            found._voxl = lenPos;
        }

        // Step over the tag word, the length word and the payload.
        word += 2 + data.getUint32(lenPos, true);
    }

    return found;
}

/**
 * Reads the mandatory 'size' chunk into the model's width, height and depth.
 *
 * @param model Model to fill in.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word; 0 means the chunk is missing.
 * @throws Error when the chunk is missing or its length is not 3 words.
 */
function parseSize(model: ModelData, data: DataView, sec: number) {
    if (sec === 0) {
        throw new Error("missing required chunk 'size'");
    }

    // Word 0 of the chunk is its length; words 1..3 are the dimensions.
    const wordAt = (n: number) => data.getUint32(sec + 4 * n, true);

    const words = wordAt(0);
    if (words !== 3) {
        throw new Error(`incorrect 'size' section length [${words}]; should be [3]`);
    }

    model._w = wordAt(1);
    model._h = wordAt(2);
    model._d = wordAt(3);
}

/**
 * Reads the optional 'offs' chunk into the model's x/y/z offset.
 *
 * @param model Model to fill in; left untouched when the chunk is absent.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word; 0 means the chunk is absent.
 * @throws Error when the chunk length is not 3 words.
 */
function parseOffs(model: ModelData, data: DataView, sec: number) {
    // An absent chunk is fine: the model keeps whatever offset it already has.
    if (sec === 0) {
        return;
    }

    const words = data.getUint32(sec, true);
    if (words !== 3) {
        throw new Error(`incorrect 'offs' section length [${words}]; should be [3]`);
    }

    // Components are fixed point with 16 fractional bits, so dividing by 2^16
    // yields the floating point value.
    // NOTE(review): the words are read as unsigned, so a negative offset cannot
    // be represented -- confirm against the format specification.
    const ONE = 1 << 16;
    const component = (n: number) => data.getUint32(sec + 4 * n, true) / ONE;

    model._x = component(1);
    model._y = component(2);
    model._z = component(3);
}

/**
 * Rebuilds the model's material table from the optional 'matl' chunk.
 *
 * The table always starts with [0, 0, 0] for material id 0. The chunk's first
 * payload word is a format code; every following word holds one material as
 * three bytes, each scaled to the range 0..1 (the fourth byte is not read).
 *
 * @param model Model whose material table is replaced.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word; 0 means the chunk is absent.
 * @throws Error when the chunk is empty or uses an unknown format.
 */
function parseMatl(model: ModelData, data: DataView, sec: number) {
    model._matl.length = 0;
    model._matl.push([0, 0, 0]);

    // The material chunk is optional; without it only the default entry remains.
    if (sec === 0) {
        return;
    }

    const len = data.getUint32(sec, true);
    if (len === 0) {
        throw new Error(`incorrect 'matl' section length [${len}]; must be greater than [0]`);
    }

    const format = data.getUint32(sec + 4, true);
    if (format !== MATL_RGB_BYTE) {
        throw new Error(`unknown 'matl' format [${format}]`);
    }

    // Entries follow the format word: payload words 1 .. len-1.
    const end = sec + 4 + 4 * len;
    for (let pos = sec + 8; pos < end; pos += 4) {
        const r = data.getUint8(pos) / 255;
        const g = data.getUint8(pos + 1) / 255;
        const b = data.getUint8(pos + 2) / 255;
        model._matl.push([r, g, b]);
    }
}

/**
 * Decodes the 'voxl' chunk by dispatching on its format code.
 *
 * When the chunk is absent, or any model dimension is 0, the voxel array is
 * emptied and nothing is decoded.
 *
 * @param model Model to fill in; its size must already be set.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word; 0 means the chunk is absent.
 * @throws Error when the chunk is empty or uses an unknown format.
 */
function parseVoxl(model: ModelData, data: DataView, sec: number) {
    const nothingToDecode =
        sec === 0 || model._w === 0 || model._h === 0 || model._d === 0;
    if (nothingToDecode) {
        model._voxl.length = 0;
        return;
    }

    const len = data.getUint32(sec, true);
    if (len === 0) {
        throw new Error(`incorrect 'voxl' section length [${len}]; must be greater than [0]`);
    }

    // The first payload word selects the run-length encoding.
    const format = data.getUint32(sec + 4, true);
    switch (format) {
        case VOXL_4_12:
            return parseVoxl4_12(model, data, sec, len);
        case VOXL_8_8:
            return parseVoxl8_8(model, data, sec, len);
        default:
            throw new Error(`unknown 'voxl' format [${format}]`);
    }
}

/**
 * Expands run-length voxel data in the 4/12 format into model._voxl.
 *
 * Each entry is a little-endian 16-bit value: the low 4 bits are the material
 * id and the high 12 bits are the run length. Decoding stops once
 * w * h * d voxels have been written or the chunk is exhausted.
 *
 * @param model Model to fill in; its size must already be set.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word.
 * @param len Chunk length in 32-bit words (payload word 0 is the format code).
 */
function parseVoxl4_12(model: ModelData, data: DataView, sec: number, len: number) {
    const ID_MASK = 0xf;
    const RUN_MASK = 0xfff;

    const total = model._w * model._h * model._d;
    let filled = 0;

    for (let word = 1; word < len && filled < total; ++word) {
        const base = sec + 4 + 4 * word;

        // A 32-bit word carries two 16-bit entries, at byte offsets 0 and 2.
        for (let half = 0; half < 4; half += 2) {
            const packed = data.getUint16(base + half, true);
            const id = packed & ID_MASK;
            const run = (packed >> 4) & RUN_MASK;

            for (let left = run; left > 0 && filled < total; --left) {
                model._voxl[filled++] = id;
            }
        }
    }
}

/**
 * Expands run-length voxel data in the 8/8 format into model._voxl.
 *
 * Each entry is two bytes: the material id followed by the run length.
 * Decoding stops once w * h * d voxels have been written or the chunk is
 * exhausted.
 *
 * @param model Model to fill in; its size must already be set.
 * @param data View over the whole file.
 * @param sec Byte offset of the chunk's length word.
 * @param len Chunk length in 32-bit words (payload word 0 is the format code).
 */
function parseVoxl8_8(model: ModelData, data: DataView, sec: number, len: number) {
    const total = model._w * model._h * model._d;
    let filled = 0;

    for (let word = 1; word < len && filled < total; ++word) {
        const base = sec + 4 + 4 * word;

        // A 32-bit word carries two <id, run> byte pairs, at byte offsets 0 and 2.
        for (let half = 0; half < 4; half += 2) {
            const id = data.getUint8(base + half);
            const run = data.getUint8(base + half + 1);

            for (let left = run; left > 0 && filled < total; --left) {
                model._voxl[filled++] = id;
            }
        }
    }
}