Aidan's Corner of the Web

Interactive UTF-8 Decoder Demo

This decoder is part of my series on decoding UTF-8.

To help illustrate my explanation, I've written an interactive decoder below. First, enter a series of bytes in hexadecimal. Then, press the "Decode" button. The resulting output (or error) will be displayed in the output box. For example, entering "F2 80 9F A2" will result in the byte sequence <0xF2 0x80 0x9F 0xA2>. When decoded, it should result in U+807E2.

Output:
Press "Decode" to view output

Some Example Inputs and Outputs

41 → U+41
ED 9F 80 → U+D7C0
F4 80 80 8F → U+10000F
ED BF 80 → UTF8 Decode Error: Decoded surrogate near byte 4
C1 81 → UTF8 Decode Error: Overlong near byte 3
41 42 80 → UTF8 Decode Error: Invalid code unit at byte 4

Decoder Demo Source

// Adapted from Douglas Crockford's C UTF-8 decoder [0].
// [0]: https://github.com/douglascrockford/JSON-c/blob/master/utf8_decode.c

/**
 * Create a UTF8Decoder over a Uint8Array (or other integer array-like
 * object), `buf`. Call `next()` to decode one Unicode scalar value at a time
 * and read `done` to find out whether any input is left.
 *
 * @constructor
 * @param {ArrayLike<number>} buf - Bytes to decode. Each element is masked to
 *   its low 8 bits when it is read.
 */
function UTF8Decoder(buf) {
  this.input = buf;
  // Zero-based offset of the next unread byte in `input`.
  this.index = 0;
}

/**
 * [PRIVATE] Consume and return the next byte in the sequence.
 *
 * @returns {number} The byte that was read, in the range 0x00-0xFF.
 * @throws {Error} If every byte of the input has already been consumed.
 */
UTF8Decoder.prototype._get = function() {
  if (this.done) {
    throw new Error('UTF8 Decode Error: truncated byte stream');
  }
  const c = this.input[this.index] & 0xFF;
  this.index += 1;
  return c;
};

/**
 * [PRIVATE] `_get()` expecting that the next byte is a continuation byte.
 *
 * @returns {number} The 6 payload bits of the continuation byte.
 * @throws {Error} If the input is exhausted, or if the byte read does not
 *   have the `10xxxxxx` bit pattern. The offending byte has already been
 *   consumed when the error is thrown.
 */
UTF8Decoder.prototype._cont = function() {
  const c = this._get();

  // Continuation bytes always carry the high bits `10`.
  if ((c & 0xC0) !== 0x80) {
    throw new Error(`UTF8 Decode Error: bad continuation byte near ${this.index + 1} '${c.toString(16)}'`);
  }

  return c & 0x3F;
};

/** [PUBLIC] Returns true if the decoder has consumed the entire string */
Object.defineProperty(UTF8Decoder.prototype, 'done', {
  get: function() {
    return this.index >= this.input.length;
  }
});

/**
 * [PUBLIC] Extract the next Unicode scalar value or throw a decode error.
 *
 * The length of the sequence is chosen from the high bits of the first byte;
 * the decoded value is then rejected if it is an overlong encoding, a
 * surrogate (U+D800..U+DFFF) or greater than U+10FFFF.
 *
 * NOTE(review): the byte number in every error message is `this.index + 1`,
 * evaluated after the bytes involved were consumed, so it is one greater than
 * the 1-based position of the last byte read. It is left as is because the
 * messages are user-visible; confirm the intended numbering before changing.
 *
 * @returns {number} The decoded scalar value.
 * @throws {Error} On truncated input, a bad continuation byte, an overlong
 *   encoding, a decoded surrogate, a value above U+10FFFF, or a first byte
 *   that cannot start a sequence.
 */
UTF8Decoder.prototype.next = function() {
  const c0 = this._get();

  // One byte case: 0xxxxxxx
  if ((c0 & 0x80) === 0) {
    return c0;
  }

  // Two byte case: 110xxxxx 10xxxxxx
  if ((c0 & 0xE0) === 0xC0) {
    const c1 = this._cont();
    const r = ((c0 & 0x1F) << 6) | c1;

    // Anything below U+0080 fits in a single byte.
    if (r < 0x80) {
      throw new Error(`UTF8 Decode Error: Overlong near byte ${this.index + 1}`);
    }

    return r;
  }

  // Three byte case: 1110xxxx 10xxxxxx 10xxxxxx
  if ((c0 & 0xF0) === 0xE0) {
    const c1 = this._cont();
    const c2 = this._cont();
    const r = ((c0 & 0x0F) << 12) | (c1 << 6) | c2;

    // Anything below U+0800 fits in two bytes.
    if (r < 0x800) {
      throw new Error(`UTF8 Decode Error: Overlong near byte ${this.index + 1}`);
    }
    if (r >= 0xD800 && r <= 0xDFFF) {
      // Only shortest-form 3 byte UTF-8 sequences can encode surrogates.
      throw new Error(`UTF8 Decode Error: Decoded surrogate near byte ${this.index + 1}`);
    }

    return r;
  }

  // Four byte case: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if ((c0 & 0xF8) === 0xF0) {
    const c1 = this._cont();
    const c2 = this._cont();
    const c3 = this._cont();
    const r = ((c0 & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;

    // Anything below U+10000 fits in three bytes.
    if (r < 0x10000) {
      throw new Error(`UTF8 Decode Error: Overlong near byte ${this.index + 1}`);
    }
    if (r > 0x10FFFF) {
      // Unicode scalar values can be at most U+10FFFF
      throw new Error(`UTF8 Decode Error: Decoded too large of a codepoint at byte ${this.index + 1}`);
    }

    return r;
  }

  // A value like 0x80 is never valid as the first byte of a sequence
  throw new Error(`UTF8 Decode Error: Invalid code unit at byte ${this.index + 1}`);
};