diff --git a/core/decoders/h264.js b/core/decoders/h264.js new file mode 100644 index 000000000..2587d61be --- /dev/null +++ b/core/decoders/h264.js @@ -0,0 +1,321 @@ +/* + * noVNC: HTML5 VNC client + * Copyright (C) 2024 The noVNC Authors + * Licensed under MPL 2.0 (see LICENSE.txt) + * + * See README.md for usage and integration instructions. + * + */ + +import * as Log from '../util/logging.js'; + +export class H264Parser { + constructor(data) { + this._data = data; + this._index = 0; + this.profileIdc = null; + this.constraintSet = null; + this.levelIdc = null; + } + + _getStartSequenceLen(index) { + let data = this._data; + if (data[index + 0] == 0 && data[index + 1] == 0 && data[index + 2] == 0 && data[index + 3] == 1) { + return 4; + } + if (data[index + 0] == 0 && data[index + 1] == 0 && data[index + 2] == 1) { + return 3; + } + return 0; + } + + _indexOfNextNalUnit(index) { + let data = this._data; + for (let i = index; i < data.length; ++i) { + if (this._getStartSequenceLen(i) != 0) { + return i; + } + } + return -1; + } + + _parseSps(index) { + this.profileIdc = this._data[index]; + this.constraintSet = this._data[index + 1]; + this.levelIdc = this._data[index + 2]; + } + + _parseNalUnit(index) { + const firstByte = this._data[index]; + if (firstByte & 0x80) { + throw new Error('H264 parsing sanity check failed, forbidden zero bit is set'); + } + const unitType = firstByte & 0x1f; + + switch (unitType) { + case 1: // coded slice, non-idr + return { slice: true }; + case 5: // coded slice, idr + return { slice: true, key: true }; + case 6: // sei + return {}; + case 7: // sps + this._parseSps(index + 1); + return {}; + case 8: // pps + return {}; + default: + Log.Warn("Unhandled unit type: ", unitType); + break; + } + return {}; + } + + parse() { + const startIndex = this._index; + let isKey = false; + + while (this._index < this._data.length) { + const startSequenceLen = this._getStartSequenceLen(this._index); + if (startSequenceLen == 0) { + throw new 
Error('Invalid start sequence in bit stream'); + } + + const { slice, key } = this._parseNalUnit(this._index + startSequenceLen); + + let nextIndex = this._indexOfNextNalUnit(this._index + startSequenceLen); + if (nextIndex == -1) { + this._index = this._data.length; + } else { + this._index = nextIndex; + } + + if (key) { + isKey = true; + } + if (slice) { + break; + } + } + + if (startIndex === this._index) { + return null; + } + + return { + frame: this._data.subarray(startIndex, this._index), + key: isKey, + }; + } +} + +export class H264Context { + constructor(width, height) { + this.lastUsed = 0; + this._width = width; + this._height = height; + this._profileIdc = null; + this._constraintSet = null; + this._levelIdc = null; + this._decoder = null; + this._pendingFrames = []; + } + + _handleFrame(frame) { + let pending = this._pendingFrames.shift(); + if (pending === undefined) { + throw new Error("Pending frame queue empty when receiving frame from decoder"); + } + + if (pending.timestamp != frame.timestamp) { + throw new Error("Video frame timestamp mismatch. Expected " + + frame.timestamp + " but got " + pending.timestamp); + } + + pending.frame = frame; + pending.ready = true; + pending.resolve(); + + if (!pending.keep) { + frame.close(); + } + } + + _handleError(e) { + throw new Error("Failed to decode frame: " + e.message); + } + + _configureDecoder(profileIdc, constraintSet, levelIdc) { + if (this._decoder === null || this._decoder.state === 'closed') { + this._decoder = new VideoDecoder({ + output: frame => this._handleFrame(frame), + error: e => this._handleError(e), + }); + } + const codec = 'avc1.' 
+ + profileIdc.toString(16).padStart(2, '0') + + constraintSet.toString(16).padStart(2, '0') + + levelIdc.toString(16).padStart(2, '0'); + this._decoder.configure({ + codec: codec, + codedWidth: this._width, + codedHeight: this._height, + optimizeForLatency: true, + }); + } + + _preparePendingFrame(timestamp) { + let pending = { + timestamp: timestamp, + promise: null, + resolve: null, + frame: null, + ready: false, + keep: false, + }; + pending.promise = new Promise((resolve) => { + pending.resolve = resolve; + }); + this._pendingFrames.push(pending); + + return pending; + } + + decode(payload) { + let parser = new H264Parser(payload); + let result = null; + + // Ideally, this timestamp should come from the server, but we'll just + // approximate it instead. + let timestamp = Math.round(window.performance.now() * 1e3); + + while (true) { + let encodedFrame = parser.parse(); + if (encodedFrame === null) { + break; + } + + if (parser.profileIdc !== null) { + this._profileIdc = parser.profileIdc; + this._constraintSet = parser.constraintSet; + this._levelIdc = parser.levelIdc; + } + + if (this._decoder === null || this._decoder.state !== 'configured') { + if (!encodedFrame.key) { + Log.Warn("Missing key frame. Can't decode until one arrives"); + continue; + } + if (this._profileIdc === null) { + Log.Warn('Cannot config decoder. Have not received SPS and PPS yet.'); + continue; + } + this._configureDecoder(this._profileIdc, this._constraintSet, + this._levelIdc); + } + + result = this._preparePendingFrame(timestamp); + + const chunk = new EncodedVideoChunk({ + timestamp: timestamp, + type: encodedFrame.key ? 
'key' : 'delta', + data: encodedFrame.frame, + }); + + try { + this._decoder.decode(chunk); + } catch (e) { + Log.Warn("Failed to decode:", e); + } + } + + // We only keep last frame of each payload + if (result !== null) { + result.keep = true; + } + + return result; + } +} + +export default class H264Decoder { + constructor() { + this._tick = 0; + this._contexts = {}; + } + + _contextId(x, y, width, height) { + return [x, y, width, height].join(','); + } + + _findOldestContextId() { + let oldestTick = Number.MAX_VALUE; + let oldestKey = undefined; + for (const [key, value] of Object.entries(this._contexts)) { + if (value.lastUsed < oldestTick) { + oldestTick = value.lastUsed; + oldestKey = key; + } + } + return oldestKey; + } + + _createContext(x, y, width, height) { + const maxContexts = 64; + if (Object.keys(this._contexts).length >= maxContexts) { + let oldestContextId = this._findOldestContextId(); + delete this._contexts[oldestContextId]; + } + let context = new H264Context(width, height); + this._contexts[this._contextId(x, y, width, height)] = context; + return context; + } + + _getContext(x, y, width, height) { + let context = this._contexts[this._contextId(x, y, width, height)]; + return context !== undefined ? 
context : this._createContext(x, y, width, height); + } + + _resetContext(x, y, width, height) { + delete this._contexts[this._contextId(x, y, width, height)]; + } + + _resetAllContexts() { + this._contexts = {}; + } + + decodeRect(x, y, width, height, sock, display, depth) { + const resetContextFlag = 1; + const resetAllContextsFlag = 2; + + if (sock.rQwait("h264 header", 8)) { + return false; + } + + const length = sock.rQshift32(); + const flags = sock.rQshift32(); + + if (sock.rQwait("h264 payload", length, 8)) { + return false; + } + + if (flags & resetAllContextsFlag) { + this._resetAllContexts(); + } else if (flags & resetContextFlag) { + this._resetContext(x, y, width, height); + } + + let context = this._getContext(x, y, width, height); + context.lastUsed = this._tick++; + + if (length !== 0) { + let payload = sock.rQshiftBytes(length, false); + let frame = context.decode(payload); + if (frame !== null) { + display.videoFrame(x, y, width, height, frame); + } + } + + return true; + } +} diff --git a/core/display.js b/core/display.js index fcd626999..bc0bf2190 100644 --- a/core/display.js +++ b/core/display.js @@ -380,6 +380,17 @@ export default class Display { }); } + videoFrame(x, y, width, height, frame) { + this._renderQPush({ + 'type': 'frame', + 'frame': frame, + 'x': x, + 'y': y, + 'width': width, + 'height': height + }); + } + blitImage(x, y, width, height, arr, offset, fromQueue) { if (this._renderQ.length !== 0 && !fromQueue) { // NB(directxman12): it's technically more performant here to use preallocated arrays, @@ -406,9 +417,16 @@ export default class Display { } } - drawImage(img, x, y) { - this._drawCtx.drawImage(img, x, y); - this._damage(x, y, img.width, img.height); + drawImage(img, ...args) { + this._drawCtx.drawImage(img, ...args); + + if (args.length <= 4) { + const [x, y] = args; + this._damage(x, y, img.width, img.height); + } else { + const [,, sw, sh, dx, dy] = args; + this._damage(dx, dy, sw, sh); + } } autoscale(containerWidth, 
containerHeight) { @@ -511,6 +529,35 @@ export default class Display { ready = false; } break; + case 'frame': + if (a.frame.ready) { + // The encoded frame may be larger than the rect due to + // limitations of the encoder, so we need to crop the + // frame. + let frame = a.frame.frame; + if (frame.codedWidth < a.width || frame.codedHeight < a.height) { + Log.Warn("Decoded video frame does not cover its full rectangle area. Expecting at least " + + a.width + "x" + a.height + " but got " + + frame.codedWidth + "x" + frame.codedHeight); + } + const sx = 0; + const sy = 0; + const sw = a.width; + const sh = a.height; + const dx = a.x; + const dy = a.y; + const dw = sw; + const dh = sh; + this.drawImage(frame, sx, sy, sw, sh, dx, dy, dw, dh); + frame.close(); + } else { + let display = this; + a.frame.promise.then(() => { + display._scanRenderQ(); + }); + ready = false; + } + break; } if (ready) { diff --git a/core/encodings.js b/core/encodings.js index 1a79989d1..aa1fd4bbc 100644 --- a/core/encodings.js +++ b/core/encodings.js @@ -15,6 +15,7 @@ export const encodings = { encodingZRLE: 16, encodingTightPNG: -260, encodingJPEG: 21, + encodingH264: 50, pseudoEncodingQualityLevel9: -23, pseudoEncodingQualityLevel0: -32, @@ -44,6 +45,7 @@ export function encodingName(num) { case encodings.encodingZRLE: return "ZRLE"; case encodings.encodingTightPNG: return "TightPNG"; case encodings.encodingJPEG: return "JPEG"; + case encodings.encodingH264: return "H.264"; default: return "[unknown encoding " + num + "]"; } } diff --git a/core/rfb.js b/core/rfb.js index f2deb0e7b..9225cb464 100644 --- a/core/rfb.js +++ b/core/rfb.js @@ -10,7 +10,7 @@ import { toUnsigned32bit, toSigned32bit } from './util/int.js'; import * as Log from './util/logging.js'; import { encodeUTF8, decodeUTF8 } from './util/strings.js'; -import { dragThreshold } from './util/browser.js'; +import { dragThreshold, supportsWebCodecsH264Decode } from './util/browser.js'; import { clientToElement } from 
'./util/element.js'; import { setCapture } from './util/events.js'; import EventTargetMixin from './util/eventtarget.js'; @@ -35,6 +35,7 @@ import TightDecoder from "./decoders/tight.js"; import TightPNGDecoder from "./decoders/tightpng.js"; import ZRLEDecoder from "./decoders/zrle.js"; import JPEGDecoder from "./decoders/jpeg.js"; +import H264Decoder from "./decoders/h264.js"; // How many seconds to wait for a disconnect to finish const DISCONNECT_TIMEOUT = 3; @@ -248,6 +249,7 @@ export default class RFB extends EventTargetMixin { this._decoders[encodings.encodingTightPNG] = new TightPNGDecoder(); this._decoders[encodings.encodingZRLE] = new ZRLEDecoder(); this._decoders[encodings.encodingJPEG] = new JPEGDecoder(); + this._decoders[encodings.encodingH264] = new H264Decoder(); // NB: nothing that needs explicit teardown should be done // before this point, since this can throw an exception @@ -2115,6 +2117,9 @@ export default class RFB extends EventTargetMixin { encs.push(encodings.encodingCopyRect); // Only supported with full depth support if (this._fbDepth == 24) { + if (supportsWebCodecsH264Decode) { + encs.push(encodings.encodingH264); + } encs.push(encodings.encodingTight); encs.push(encodings.encodingTightPNG); encs.push(encodings.encodingZRLE); diff --git a/core/util/browser.js b/core/util/browser.js index bbc9f5c1e..1ecded662 100644 --- a/core/util/browser.js +++ b/core/util/browser.js @@ -70,6 +70,26 @@ try { } export const hasScrollbarGutter = _hasScrollbarGutter; +export let supportsWebCodecsH264Decode = false; + +async function _checkWebCodecsH264DecodeSupport() { + if (!('VideoDecoder' in window)) { + return; + } + + // We'll need to make do with some placeholders here + const config = { + codec: 'avc1.42401f', + codedWidth: 1920, + codedHeight: 1080, + optimizeForLatency: true, + }; + + const result = await VideoDecoder.isConfigSupported(config); + supportsWebCodecsH264Decode = result.supported; +} +_checkWebCodecsH264DecodeSupport(); + /* * The 
functions for detection of platforms and browsers below are exported * but the use of these should be minimized as much as possible. diff --git a/tests/test.h264.js b/tests/test.h264.js new file mode 100644 index 000000000..42273e7ce --- /dev/null +++ b/tests/test.h264.js @@ -0,0 +1,264 @@ +import Websock from '../core/websock.js'; +import Display from '../core/display.js'; + +import { H264Parser } from '../core/decoders/h264.js'; +import H264Decoder from '../core/decoders/h264.js'; +import Base64 from '../core/base64.js'; + +import FakeWebSocket from './fake.websocket.js'; + +/* This is a 3 frame 16x16 video where the first frame is solid red, the second + * is solid green and the third is solid blue. + * + * The colour space is BT.709. It is encoded into the stream. + */ +const redGreenBlue16x16Video = new Uint8Array(Base64.decode( + 'AAAAAWdCwBTZnpuAgICgAAADACAAAAZB4oVNAAAAAWjJYyyAAAABBgX//4HcRem95tlIt5Ys' + + '2CDZI+7veDI2NCAtIGNvcmUgMTY0IHIzMTA4IDMxZTE5ZjkgLSBILjI2NC9NUEVHLTQgQVZD' + + 'IGNvZGVjIC0gQ29weWxlZnQgMjAwMy0yMDIzIC0gaHR0cDovL3d3dy52aWRlb2xhbi5vcmcv' + + 'eDI2NC5odG1sIC0gb3B0aW9uczogY2FiYWM9MCByZWY9NSBkZWJsb2NrPTE6MDowIGFuYWx5' + + 'c2U9MHgxOjB4MTExIG1lPWhleCBzdWJtZT04IHBzeT0xIHBzeV9yZD0xLjAwOjAuMDAgbWl4' + + 'ZWRfcmVmPTEgbWVfcmFuZ2U9MTYgY2hyb21hX21lPTEgdHJlbGxpcz0yIDh4OGRjdD0wIGNx' + + 'bT0wIGRlYWR6b25lPTIxLDExIGZhc3RfcHNraXA9MSBjaHJvbWFfcXBfb2Zmc2V0PS0yIHRo' + + 'cmVhZHM9MSBsb29rYWhlYWRfdGhyZWFkcz0xIHNsaWNlZF90aHJlYWRzPTAgbnI9MCBkZWNp' + + 'bWF0ZT0xIGludGVybGFjZWQ9MCBibHVyYXlfY29tcGF0PTAgY29uc3RyYWluZWRfaW50cmE9' + + 'MCBiZnJhbWVzPTAgd2VpZ2h0cD0wIGtleWludD1pbmZpbml0ZSBrZXlpbnRfbWluPTI1IHNj' + + 'ZW5lY3V0PTQwIGludHJhX3JlZnJlc2g9MCByY19sb29rYWhlYWQ9NTAgcmM9YWJyIG1idHJl' + + 'ZT0xIGJpdHJhdGU9NDAwIHJhdGV0b2w9MS4wIHFjb21wPTAuNjAgcXBtaW49MCBxcG1heD02' + + 'OSBxcHN0ZXA9NCBpcF9yYXRpbz0xLjQwIGFxPTE6MS4wMACAAAABZYiEBrxmKAAPVccAAS04' + + '4AA5DRJMnkycJk4TPwAAAAFBiIga8RigADVVHAAGaGOAANtuAAAAAUGIkBr///wRRQABVf8c' + + 'AAcho4AAiD4=')); + +let _haveH264Decode = 
null; + +async function haveH264Decode() { + if (_haveH264Decode !== null) { + return _haveH264Decode; + } + + if (!('VideoDecoder' in window)) { + _haveH264Decode = false; + return false; + } + + // We'll need to make do with some placeholders here + const config = { + codec: 'avc1.42401f', + codedWidth: 1920, + codedHeight: 1080, + optimizeForLatency: true, + }; + + _haveH264Decode = (await VideoDecoder.isConfigSupported(config)).supported; + return _haveH264Decode; +} + +function createSolidColorFrameBuffer(color, width, height) { + const r = (color >> 24) & 0xff; + const g = (color >> 16) & 0xff; + const b = (color >> 8) & 0xff; + const a = (color >> 0) & 0xff; + + const size = width * height * 4; + let array = new Uint8ClampedArray(size); + + for (let i = 0; i < size / 4; ++i) { + array[i * 4 + 0] = r; + array[i * 4 + 1] = g; + array[i * 4 + 2] = b; + array[i * 4 + 3] = a; + } + + return array; +} + +function makeMessageHeader(length, resetContext, resetAllContexts) { + let flags = 0; + if (resetContext) { + flags |= 1; + } + if (resetAllContexts) { + flags |= 2; + } + + let header = new Uint8Array(8); + let i = 0; + + let appendU32 = (v) => { + header[i++] = (v >> 24) & 0xff; + header[i++] = (v >> 16) & 0xff; + header[i++] = (v >> 8) & 0xff; + header[i++] = v & 0xff; + }; + + appendU32(length); + appendU32(flags); + + return header; +} + +function wrapRectData(data, resetContext, resetAllContexts) { + let header = makeMessageHeader(data.length, resetContext, resetAllContexts); + return Array.from(header).concat(Array.from(data)); +} + +function testDecodeRect(decoder, x, y, width, height, data, display, depth) { + let sock; + let done = false; + + sock = new Websock; + sock.open("ws://example.com"); + + sock.on('message', () => { + done = decoder.decodeRect(x, y, width, height, sock, display, depth); + }); + + // Empty messages are filtered at multiple layers, so we need to + // do a direct call + if (data.length === 0) { + done = decoder.decodeRect(x, y, width, height, 
sock, display, depth); + } else { + sock._websocket._receiveData(new Uint8Array(data)); + } + + display.flip(); + + return done; +} + +function almost(a, b) { + let diff = Math.abs(a - b); + return diff < 5; +} + +describe('H.264 Parser', function () { + it('should parse constrained baseline video', function () { + let parser = new H264Parser(redGreenBlue16x16Video); + + let frame = parser.parse(); + expect(frame).to.have.property('key', true); + + expect(parser).to.have.property('profileIdc', 66); + expect(parser).to.have.property('constraintSet', 192); + expect(parser).to.have.property('levelIdc', 20); + + frame = parser.parse(); + expect(frame).to.have.property('key', false); + + frame = parser.parse(); + expect(frame).to.have.property('key', false); + + frame = parser.parse(); + expect(frame).to.be.null; + }); +}); + +describe('H.264 Decoder Unit Test', function () { + let decoder; + + beforeEach(async function () { + if (!await haveH264Decode()) { + this.skip(); + return; + } + decoder = new H264Decoder(); + }); + + it('creates and resets context', function () { + let context = decoder._getContext(1, 2, 3, 4); + expect(context._width).to.equal(3); + expect(context._height).to.equal(4); + expect(decoder._contexts).to.not.be.empty; + decoder._resetContext(1, 2, 3, 4); + expect(decoder._contexts).to.be.empty; + }); + + it('resets all contexts', function () { + decoder._getContext(0, 0, 1, 1); + decoder._getContext(2, 2, 1, 1); + expect(decoder._contexts).to.not.be.empty; + decoder._resetAllContexts(); + expect(decoder._contexts).to.be.empty; + }); + + it('caches contexts', function () { + let c1 = decoder._getContext(1, 2, 3, 4); + c1.lastUsed = 1; + let c2 = decoder._getContext(1, 2, 3, 4); + c2.lastUsed = 2; + expect(Object.keys(decoder._contexts).length).to.equal(1); + expect(c1.lastUsed).to.equal(c2.lastUsed); + }); + + it('deletes oldest context', function () { + for (let i = 0; i < 65; ++i) { + let context = decoder._getContext(i, 0, 1, 1); + 
context.lastUsed = i; + } + + expect(decoder._findOldestContextId()).to.equal('1,0,1,1'); + expect(decoder._contexts[decoder._contextId(0, 0, 1, 1)]).to.be.undefined; + expect(decoder._contexts[decoder._contextId(1, 0, 1, 1)]).to.not.be.null; + expect(decoder._contexts[decoder._contextId(63, 0, 1, 1)]).to.not.be.null; + expect(decoder._contexts[decoder._contextId(64, 0, 1, 1)]).to.not.be.null; + }); +}); + +describe('H.264 Decoder Functional Test', function () { + let decoder; + let display; + + before(FakeWebSocket.replace); + after(FakeWebSocket.restore); + + beforeEach(async function () { + if (!await haveH264Decode()) { + this.skip(); + return; + } + decoder = new H264Decoder(); + display = new Display(document.createElement('canvas')); + display.resize(16, 16); + }); + + it('should handle H.264 rect', async function () { + let data = wrapRectData(redGreenBlue16x16Video, false, false); + let done = testDecodeRect(decoder, 0, 0, 16, 16, data, display, 24); + expect(done).to.be.true; + await display.flush(); + let targetData = createSolidColorFrameBuffer(0x0000ffff, 16, 16); + expect(display).to.have.displayed(targetData, almost); + }); + + it('should handle specific context reset', async function () { + let data = wrapRectData(redGreenBlue16x16Video, false, false); + let done = testDecodeRect(decoder, 0, 0, 16, 16, data, display, 24); + expect(done).to.be.true; + await display.flush(); + let targetData = createSolidColorFrameBuffer(0x0000ffff, 16, 16); + expect(display).to.have.displayed(targetData, almost); + + data = wrapRectData([], true, false); + done = testDecodeRect(decoder, 0, 0, 16, 16, data, display, 24); + expect(done).to.be.true; + await display.flush(); + + expect(decoder._contexts[decoder._contextId(0, 0, 16, 16)]._decoder).to.be.null; + }); + + it('should handle global context reset', async function () { + let data = wrapRectData(redGreenBlue16x16Video, false, false); + let done = testDecodeRect(decoder, 0, 0, 16, 16, data, display, 24); + 
expect(done).to.be.true; + await display.flush(); + let targetData = createSolidColorFrameBuffer(0x0000ffff, 16, 16); + expect(display).to.have.displayed(targetData, almost); + + data = wrapRectData([], false, true); + done = testDecodeRect(decoder, 0, 0, 16, 16, data, display, 24); + expect(done).to.be.true; + await display.flush(); + + expect(decoder._contexts[decoder._contextId(0, 0, 16, 16)]._decoder).to.be.null; + }); +});