From 699aa299f8ccb9d8121899694f002d79bd1a7afc Mon Sep 17 00:00:00 2001
From: Daniel Perez Alvarez <unindented@gmail.com>
Date: Mon, 13 Aug 2012 17:09:02 +0100
Subject: [PATCH] Normalize inserted text using UNorm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason, the client was sending the server a Unicode-normalized
(NFC, composed) version of inserted strings. So if, for example, we
inserted the string
'ä' (i.e. \x61\xCC\x88) into the document, what would be sent to the
server would be 'ä' (i.e. \xC3\xA4).

This wouldn't be a problem on its own, but JavaScript reports that the
length of the first string is 2 (two UTF-16 code units: the base letter
plus the combining mark), while the length of the second one is 1.

So the command that was being sent to the server was 'Z:1>2*0+1$ä', when
it should really be 'Z:1>1*0+1$ä'. When the `checkRep` method checks the
length of the inserted string, it finds an inconsistency, and
disconnects the client.

We now normalize the inserted string to NFC (via UNorm.nfc in
sanitizeUnicode) before the command is generated, so the length the
client computes always matches the string that is actually sent.
---
 src/static/js/contentcollector.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/static/js/contentcollector.js b/src/static/js/contentcollector.js
index 777b3421c..b27dfc5e0 100644
--- a/src/static/js/contentcollector.js
+++ b/src/static/js/contentcollector.js
@@ -25,13 +25,14 @@
 
 var _MAX_LIST_LEVEL = 8;
 
+var UNorm = require('./unorm');
 var Changeset = require('./Changeset');
 var hooks = require('./pluginfw/hooks');
 var _ = require('./underscore');
 
 function sanitizeUnicode(s)
 {
-  return s.replace(/[\uffff\ufffe\ufeff\ufdd0-\ufdef\ud800-\udfff]/g, '?');
+  return UNorm.nfc(s).replace(/[\uffff\ufffe\ufeff\ufdd0-\ufdef\ud800-\udfff]/g, '?');
 }
 
 function makeContentCollector(collectStyles, browser, apool, domInterface, className2Author)