pad.libre-service.eu-etherpad/src/node/utils/ExportTxt.js

/**
 * Copyright 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS-IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


var async = require("async");
var Changeset = require("ep_etherpad-lite/static/js/Changeset");
var padManager = require("../db/PadManager");
var ERR = require("async-stacktrace");
var Security = require('ep_etherpad-lite/static/js/security');
var hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks');
/**
 * Builds a simple plain-text rendering of a pad: one output line per pad
 * line, with list items indented by (2 * level - 1) spaces and prefixed
 * with "* ".
 *
 * NOTE(review): this helper calls pad.atext() and pad.pool() as methods and
 * expects pad.getInternalRevisionAText(revNum) to return its result directly,
 * whereas the other functions in this file read pad.atext as a property, call
 * pad.apool() and pass a callback to getInternalRevisionAText. It is neither
 * exported nor called in the visible part of this file -- confirm against the
 * Pad API before relying on it.
 *
 * @param {Object} pad - pad to render
 * @param {Number} [revNum] - revision to render; the current text when undefined
 * @returns {String} the rendered text, every line terminated by '\n'
 */
function getPadPlainText(pad, revNum)
{
  var source;
  if (revNum !== undefined)
  {
    source = pad.getInternalRevisionAText(revNum);
  }
  else
  {
    source = pad.atext();
  }

  // the trailing newline of the text is dropped before splitting into lines
  var rawLines = source.text.slice(0, -1).split('\n');
  var lineAttribs = Changeset.splitAttributionLines(source.attribs, source.text);
  var pool = pad.pool();

  var out = [];
  rawLines.forEach(function (rawLine, n)
  {
    var info = _analyzeLine(rawLine, lineAttribs[n], pool);
    if (!info.listLevel)
    {
      out.push(info.text, '\n');
      return;
    }

    var numSpaces = info.listLevel * 2 - 1;
    var indent = new Array(numSpaces + 1).join(' ');
    out.push(indent, '*', ' ', info.text, '\n');
  });

  return out.join('');
}

/**
 * Produces the plain-text export of a pad, optionally for a given revision.
 *
 * @param {Object} pad - pad to export
 * @param {Number} [revNum] - revision to export; when null/undefined the
 *   pad's current atext (pad.atext) is used
 * @param {Function} callback - called as callback(err) on failure or
 *   callback(null, text) with the exported text
 */
function getPadTXT(pad, revNum, callback)
{
  var atext = pad.atext;
  var txt;

  // step 1: swap in the atext of the requested revision, if one was given
  function loadRevision(next)
  {
    if (revNum == undefined)
    {
      next(null);
      return;
    }

    pad.getInternalRevisionAText(revNum, function (err, revisionAtext)
    {
      if (ERR(err, next)) return;
      atext = revisionAtext;
      next();
    });
  }

  // step 2: render the chosen atext as text
  function render(next)
  {
    txt = getTXTFromAtext(pad, atext);
    next(null);
  }

  // final step: hand the result (or the error) to the caller
  function finish(err)
  {
    if (ERR(err, callback)) return;
    callback(null, txt);
  }

  async.waterfall([loadRevision, render], finish);
}

// Public API of this module. getTXTFromAtext is defined further down with a
// function declaration, which is hoisted, so it can already be referenced here.
exports.getPadTXT = getPadTXT;
exports.getTXTFromAtext = getTXTFromAtext;

/**
 * Renders an attributed text (atext) as the plain-text export of a pad.
 *
 * Every pad line becomes one output line terminated by '\n'. Lines that are
 * list items are indented with one tab per list level; bullet items are
 * prefixed with "* " and numbered items with "<level>. ".
 *
 * NOTE(review): the line content is still passed through _encodeWhitespace
 * and _processSpaces, which emit "&#N;" and "&nbsp;" sequences -- confirm
 * whether that is wanted in a plain-text export.
 *
 * @param {Object} pad - pad being exported; only pad.apool() is used here
 * @param {Object} atext - attributed text; its `text` and `attribs` are read
 * @param {*} [authorColors] - not used; kept so existing callers keep working
 * @returns {String} the exported text
 */
function getTXTFromAtext(pad, atext, authorColors)
{
  var apool = pad.apool();
  // the trailing newline of the text is dropped before splitting into lines
  var textLines = atext.text.slice(0, -1).split('\n');
  var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);

  // Formatting attributes tracked while walking a line. No markup is emitted
  // for them in plain text; they only drive the span state machine below.
  var props = ['heading1', 'heading2', 'bold', 'italic', 'underline', 'strikethrough'];
  var anumMap = {}; // attribute number in the pool -> index into props

  props.forEach(function (propName, i)
  {
    // presumably the second argument means "look up only, do not add";
    // a negative result is treated as "attribute not in the pool"
    var propTrueNum = apool.putAttrib([propName, true], true);
    if (propTrueNum >= 0)
    {
      anumMap[propTrueNum] = i;
    }
  });

  // Returns the text of a single line (list marker already stripped).
  function getLineTXT(text, attribs)
  {
    // one state slot per tracked property
    var propVals = props.map(function () { return false; });
    var ENTER = 1;
    var STAY = 2;
    var LEAVE = 0;

    var taker = Changeset.stringIterator(text);
    var assem = Changeset.stringAssembler();

    var idx = 0;

    function processNextChars(numChars)
    {
      if (numChars <= 0)
      {
        return;
      }

      var iter = Changeset.opIterator(Changeset.subattribution(attribs, idx, idx + numChars));
      idx += numChars;

      while (iter.hasNext())
      {
        var o = iter.next();
        var propChanged = false;
        Changeset.eachAttribNumber(o.attribs, function (a)
        {
          if (a in anumMap)
          {
            var propIdx = anumMap[a];
            if (!propVals[propIdx])
            {
              propVals[propIdx] = ENTER;
              propChanged = true;
            }
            else
            {
              propVals[propIdx] = STAY;
            }
          }
        });
        for (var p = 0; p < propVals.length; p++)
        {
          if (propVals[p] === true)
          {
            // was active before this span but is not set on it
            propVals[p] = LEAVE;
            propChanged = true;
          }
          else if (propVals[p] === STAY)
          {
            propVals[p] = true; // set it back
          }
        }
        // now each member of propVals is in {false,LEAVE,ENTER,true}
        // according to what happens at start of span
        if (propChanged)
        {
          // leaving an outer property also closes the ones nested after it
          var left = false;
          for (var q = 0; q < propVals.length; q++)
          {
            if (!left)
            {
              if (propVals[q] === LEAVE)
              {
                left = true;
              }
            }
            else if (propVals[q] === true)
            {
              propVals[q] = STAY; // would be closed and re-opened
            }
          }

          // Plain text has no open/close markup to emit (the original called
          // an undefined emitOpenTag() here and threw), so only settle the
          // state: LEAVE -> false, ENTER/STAY -> true.
          for (var r = 0; r < propVals.length; r++)
          {
            if (propVals[r] === LEAVE)
            {
              propVals[r] = false;
            }
            else if (propVals[r] === ENTER || propVals[r] === STAY)
            {
              propVals[r] = true;
            }
          }
          // propVals is now all {true,false} again
        } // end if (propChanged)
        var chars = o.chars;
        if (o.lines)
        {
          chars--; // exclude newline at end of line, if present
        }

        var s = taker.take(chars);

        // Remove every form feed (char code 12). Their origin is unknown, but
        // they break the abiword parser. A string pattern would only remove
        // the first occurrence, hence the global regex.
        s = s.replace(/\x0c/g, "");

        assem.append(_encodeWhitespace(s));
      } // end iteration over spans in line

      // nothing stays active past the end of the line
      for (var t = propVals.length - 1; t >= 0; t--)
      {
        propVals[t] = false;
      }
    } // end processNextChars
    processNextChars(text.length - idx);

    return _processSpaces(assem.toString());
  } // end getLineTXT

  var pieces = [];

  for (var i = 0; i < textLines.length; i++)
  {
    var line = _analyzeLine(textLines[i], attribLines[i], apool);
    var lineContent = getLineTXT(line.text, line.aline);
    if (line.listTypeName === "bullet")
    {
      lineContent = "* " + lineContent; // add a bullet
    }
    if (line.listLevel > 0)
    {
      // one tab per list level
      for (var j = line.listLevel - 1; j >= 0; j--)
      {
        pieces.push('\t');
      }
      if (line.listTypeName === "number")
      {
        // This is bad because it doesn't truly reflect what the user
        // sees because browsers do magic on nested <ol><li>s
        pieces.push(line.listLevel + ". ");
      }
    }
    pieces.push(lineContent, '\n');

    // I'm not too keen about using the HTML export filters here, they could cause real pain in the future
    // I'd suggest supporting getLineTXTForExport
    // Whatever the hook call returns is appended after the line when truthy.
    var lineContentFromHook = hooks.callAllStr("getLineHTMLForExport",
    {
      line: line,
      apool: apool,
      attribLine: attribLines[i],
      text: textLines[i]
    }, " ", " ", "");
    if (lineContentFromHook)
    {
      pieces.push(lineContentFromHook);
    }
  }

  return pieces.join('');
}

/**
 * Inspects one pad line and separates its list information from its content.
 *
 * When the first op of the line's attribution carries a 'list' attribute,
 * the first character of the text is treated as the list marker and is
 * stripped from both the text and the attribution string.
 *
 * @param {String} text - raw text of the line
 * @param {String} aline - attribution string of the line (may be falsy)
 * @param {Object} apool - attribute pool used to resolve the 'list' attribute
 * @returns {Object} { listLevel, listTypeName (only for recognised list
 *   types), text, aline }; listLevel is 0 for non-list lines
 */
function _analyzeLine(text, aline, apool)
{
  var info = { listLevel: 0 };
  var hasMarker = false;

  if (aline)
  {
    var ops = Changeset.opIterator(aline);
    if (ops.hasNext())
    {
      var listAttr = Changeset.opAttributeValue(ops.next(), 'list', apool);
      if (listAttr)
      {
        hasMarker = true;
        // e.g. "bullet2" -> type "bullet", level 2
        var parsed = /([a-z]+)([12345678])/.exec(listAttr);
        if (parsed)
        {
          info.listTypeName = parsed[1];
          info.listLevel = Number(parsed[2]);
        }
      }
    }
  }

  // drop the marker character from the text and its attribution, if present
  info.text = hasMarker ? text.substring(1) : text;
  info.aline = hasMarker ? Changeset.subattribution(aline, 1) : aline;

  return info;
}

/**
 * Looks up a pad by id and produces its plain-text export.
 *
 * @param {String} padId - id passed to padManager.getPad
 * @param {Number} [revNum] - revision to export, forwarded to getPadTXT
 * @param {*} noDocType - not used by this function
 * @param {Function} callback - called as callback(null, text) on success;
 *   errors are routed through ERR(err, callback)
 */
exports.getPadTXTDocument = function (padId, revNum, noDocType, callback)
{
  padManager.getPad(padId, function (padErr, pad)
  {
    if (ERR(padErr, callback))
    {
      return;
    }

    getPadTXT(pad, revNum, function (exportErr, txt)
    {
      if (ERR(exportErr, callback))
      {
        return;
      }
      callback(null, txt);
    });
  });
};

/**
 * Replaces every character that is neither printable ASCII (0x21-0x7E) nor
 * whitespace with a decimal numeric character reference ("&#<code>;").
 * The replacement works per UTF-16 code unit.
 *
 * @param {String} s - text to encode
 * @returns {String} the encoded text
 */
function _encodeWhitespace(s) {
  var nonPlainChar = /[^\x21-\x7E\s\t\n\r]/g;

  function toNumericReference(ch)
  {
    return "&#" + ch.charCodeAt(0) + ";";
  }

  return s.replace(nonPlainChar, toNumericReference);
}

// copied from ACE
/**
 * Rewrites spaces so that runs of spaces survive in wrapping HTML-like text:
 * within a run only the last space stays a normal space, and a space at the
 * very start or very end of the line is always replaced. Replaced spaces
 * become "&nbsp;". Anything that starts with "<" is treated as a tag and is
 * transparent to these rules.
 *
 * @param {String} s - line content
 * @returns {String} the content with the affected spaces replaced
 */
function _processSpaces(s)
{
  var NBSP = '&nbsp;';

  // split into tags, single spaces and runs of any other text
  var tokens = s.match(/<[^>]*>?| |[^ <]+/g) || [];

  function isTag(token)
  {
    return token.charAt(0) == "<";
  }

  // Walk backwards: a space is replaced when nothing but tags follows it
  // on the line, or when the next non-tag token after it is also a space.
  var atLineEnd = true;
  var spaceFollows = false;
  var k = tokens.length;
  while (k-- > 0)
  {
    var token = tokens[k];
    if (token == " ")
    {
      if (atLineEnd || spaceFollows)
      {
        tokens[k] = NBSP;
      }
      atLineEnd = false;
      spaceFollows = true;
    }
    else if (!isTag(token))
    {
      atLineEnd = false;
      spaceFollows = false;
    }
  }

  // Walk forwards: a space that is the first non-tag token is replaced too.
  for (k = 0; k < tokens.length; k++)
  {
    if (tokens[k] == " ")
    {
      tokens[k] = NBSP;
      break;
    }
    if (!isTag(tokens[k]))
    {
      break;
    }
  }

  return tokens.join('');
}


// copied from ACE
// Matches one "word" character: ASCII digits (U+0030-0039) and ASCII letters
// (U+0041-005A, U+0061-007A) plus the non-ASCII ranges listed explicitly
// in the character class. Not referenced in the visible part of this file.
var _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
// Matches one whitespace character. Not referenced in the visible part of this file.
var _REGEX_SPACE = /\s/;
begin support for better txt output 2013-02-10 18:34:34 +01:00			`/**`
			`* Copyright 2009 Google Inc.`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS-IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`


			`var async = require("async");`
			`var Changeset = require("ep_etherpad-lite/static/js/Changeset");`
			`var padManager = require("../db/PadManager");`
			`var ERR = require("async-stacktrace");`
			`var Security = require('ep_etherpad-lite/static/js/security');`
			`var hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks');`
			`function getPadPlainText(pad, revNum)`
			`{`
			`var atext = ((revNum !== undefined) ? pad.getInternalRevisionAText(revNum) : pad.atext());`
			`var textLines = atext.text.slice(0, -1).split('\n');`
			`var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);`
			`var apool = pad.pool();`

			`var pieces = [];`
			`for (var i = 0; i < textLines.length; i++)`
			`{`
			`var line = _analyzeLine(textLines[i], attribLines[i], apool);`
			`if (line.listLevel)`
			`{`
			`var numSpaces = line.listLevel * 2 - 1;`
			`var bullet = '*';`
			`pieces.push(new Array(numSpaces + 1).join(' '), bullet, ' ', line.text, '\n');`
			`}`
			`else`
			`{`
			`pieces.push(line.text, '\n');`
			`}`
			`}`

			`return pieces.join('');`
			`}`

			`function getPadTXT(pad, revNum, callback)`
			`{`
			`var atext = pad.atext;`
			`var html;`
			`async.waterfall([`
			`// fetch revision atext`


			`function (callback)`
			`{`
			`if (revNum != undefined)`
			`{`
			`pad.getInternalRevisionAText(revNum, function (err, revisionAtext)`
			`{`
			`if(ERR(err, callback)) return;`
			`atext = revisionAtext;`
			`callback();`
			`});`
			`}`
			`else`
			`{`
			`callback(null);`
			`}`
			`},`

			`// convert atext to html`


			`function (callback)`
			`{`
			`html = getTXTFromAtext(pad, atext);`
			`callback(null);`
			`}],`
			`// run final callback`


			`function (err)`
			`{`
			`if(ERR(err, callback)) return;`
			`callback(null, html);`
			`});`
			`}`

			`exports.getPadTXT = getPadTXT;`
			`exports.getTXTFromAtext = getTXTFromAtext;`

			`function getTXTFromAtext(pad, atext, authorColors)`
			`{`
			`var apool = pad.apool();`
			`var textLines = atext.text.slice(0, -1).split('\n');`
			`var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);`

			`var tags = ['h1', 'h2', 'strong', 'em', 'u', 's'];`
			`var props = ['heading1', 'heading2', 'bold', 'italic', 'underline', 'strikethrough'];`
			`var anumMap = {};`
			`var css = "";`

			`props.forEach(function (propName, i)`
			`{`
			`var propTrueNum = apool.putAttrib([propName, true], true);`
			`if (propTrueNum >= 0)`
			`{`
			`anumMap[propTrueNum] = i;`
			`}`
			`});`

			`function getLineTXT(text, attribs)`
			`{`
			`var propVals = [false, false, false];`
			`var ENTER = 1;`
			`var STAY = 2;`
			`var LEAVE = 0;`

			`// Use order of tags (b/i/u) as order of nesting, for simplicity`
			`// and decent nesting. For example,`
			`// <b>Just bold<b> <b><i>Bold and italics</i></b> <i>Just italics</i>`
			`// becomes`
			`// <b>Just bold <i>Bold and italics</i></b> <i>Just italics</i>`
			`var taker = Changeset.stringIterator(text);`
			`var assem = Changeset.stringAssembler();`
			`var openTags = [];`

			`var idx = 0;`

			`function processNextChars(numChars)`
			`{`
			`if (numChars <= 0)`
			`{`
			`return;`
			`}`

			`var iter = Changeset.opIterator(Changeset.subattribution(attribs, idx, idx + numChars));`
			`idx += numChars;`

			`while (iter.hasNext())`
			`{`
			`var o = iter.next();`
			`var propChanged = false;`
			`Changeset.eachAttribNumber(o.attribs, function (a)`
			`{`
			`if (a in anumMap)`
			`{`
			`var i = anumMap[a]; // i = 0 => bold, etc.`
			`if (!propVals[i])`
			`{`
			`propVals[i] = ENTER;`
			`propChanged = true;`
			`}`
			`else`
			`{`
			`propVals[i] = STAY;`
			`}`
			`}`
			`});`
			`for (var i = 0; i < propVals.length; i++)`
			`{`
			`if (propVals[i] === true)`
			`{`
			`propVals[i] = LEAVE;`
			`propChanged = true;`
			`}`
			`else if (propVals[i] === STAY)`
			`{`
			`propVals[i] = true; // set it back`
			`}`
			`}`
			`// now each member of propVal is in {false,LEAVE,ENTER,true}`
			`// according to what happens at start of span`
			`if (propChanged)`
			`{`
			`// leaving bold (e.g.) also leaves italics, etc.`
			`var left = false;`
			`for (var i = 0; i < propVals.length; i++)`
			`{`
			`var v = propVals[i];`
			`if (!left)`
			`{`
			`if (v === LEAVE)`
			`{`
			`left = true;`
			`}`
			`}`
			`else`
			`{`
			`if (v === true)`
			`{`
			`propVals[i] = STAY; // tag will be closed and re-opened`
			`}`
			`}`
			`}`

			`var tags2close = [];`

			`for (var i = propVals.length - 1; i >= 0; i--)`
			`{`
			`if (propVals[i] === LEAVE)`
			`{`
			`//emitCloseTag(i);`
			`tags2close.push(i);`
			`propVals[i] = false;`
			`}`
			`else if (propVals[i] === STAY)`
			`{`
			`//emitCloseTag(i);`
			`tags2close.push(i);`
			`}`
			`}`

			`for (var i = 0; i < propVals.length; i++)`
			`{`
			`if (propVals[i] === ENTER \|\| propVals[i] === STAY)`
			`{`
			`emitOpenTag(i);`
			`propVals[i] = true;`
			`}`
			`}`
			`// propVals is now all {true,false} again`
			`} // end if (propChanged)`
			`var chars = o.chars;`
			`if (o.lines)`
			`{`
			`chars--; // exclude newline at end of line, if present`
			`}`

			`var s = taker.take(chars);`

			`//removes the characters with the code 12. Don't know where they come`
			`//from but they break the abiword parser and are completly useless`
			`s = s.replace(String.fromCharCode(12), "");`

stop urls being encoded, not sure about other security implications here... 2013-02-10 20:21:27 +01:00			`// assem.append(_encodeWhitespace(Security.escapeHTML(s)));`
			`assem.append(_encodeWhitespace(s));`
begin support for better txt output 2013-02-10 18:34:34 +01:00			`} // end iteration over spans in line`

			`var tags2close = [];`
			`for (var i = propVals.length - 1; i >= 0; i--)`
			`{`
			`if (propVals[i])`
			`{`
			`tags2close.push(i);`
			`propVals[i] = false;`
			`}`
			`}`

			`} // end processNextChars`
			`processNextChars(text.length - idx);`

			`return _processSpaces(assem.toString());`
			`} // end getLineHTML`
			`var pieces = [css];`

			`// Need to deal with constraints imposed on HTML lists; can`
			`// only gain one level of nesting at once, can't change type`
			`// mid-list, etc.`
			`// People might use weird indenting, e.g. skip a level,`
			`// so we want to do something reasonable there. We also`
			`// want to deal gracefully with blank lines.`
			`// => keeps track of the parents level of indentation`
			`var lists = []; // e.g. [[1,'bullet'], [3,'bullet'], ...]`
			`for (var i = 0; i < textLines.length; i++)`
			`{`
			`var line = _analyzeLine(textLines[i], attribLines[i], apool);`
			`var lineContent = getLineTXT(line.text, line.aline);`
			`if(line.listTypeName == "bullet"){`
			`lineContent = "* " + lineContent; // add a bullet`
			`}`
			`if(line.listLevel > 0){`
			`for (var j = line.listLevel - 1; j >= 0; j--){`
			`pieces.push('\t');`
			`}`
			`if(line.listTypeName == "number"){`
			`pieces.push(line.listLevel + ". ");`
			`// This is bad because it doesn't truly reflect what the user`
			`// sees because browsers do magic on nested <ol><li>s`
			`}`
			`pieces.push(lineContent, '\n');`
			`}else{`
			`pieces.push(lineContent, '\n');`
			`}`

			`// I'm not too keen about using teh HTML export filters here, they could cause real pain in the future`
			`// I'd suggest supporting getLineTXTForExport`
			`var lineContentFromHook = hooks.callAllStr("getLineHTMLForExport",`
			`{`
			`line: line,`
			`apool: apool,`
			`attribLine: attribLines[i],`
			`text: textLines[i]`
			`}, " ", " ", "");`
			`if (lineContentFromHook)`
			`{`
			`pieces.push(lineContentFromHook, '');`
			`}`
			`else`
			`{`
			`// pieces.push(lineContent, '\n');`
			`}`
			`}`

			`return pieces.join('');`
			`}`

			`function _analyzeLine(text, aline, apool)`
			`{`
			`var line = {};`

			`// identify list`
			`var lineMarker = 0;`
			`line.listLevel = 0;`
			`if (aline)`
			`{`
			`var opIter = Changeset.opIterator(aline);`
			`if (opIter.hasNext())`
			`{`
			`var listType = Changeset.opAttributeValue(opIter.next(), 'list', apool);`
			`if (listType)`
			`{`
			`lineMarker = 1;`
			`listType = /([a-z]+)([12345678])/.exec(listType);`
			`if (listType)`
			`{`
			`line.listTypeName = listType[1];`
			`line.listLevel = Number(listType[2]);`
			`}`
			`}`
			`}`
			`}`
			`if (lineMarker)`
			`{`
			`line.text = text.substring(1);`
			`line.aline = Changeset.subattribution(aline, 1);`
			`}`
			`else`
			`{`
			`line.text = text;`
			`line.aline = aline;`
			`}`

			`return line;`
			`}`

			`exports.getPadTXTDocument = function (padId, revNum, noDocType, callback)`
			`{`
			`padManager.getPad(padId, function (err, pad)`
			`{`
			`if(ERR(err, callback)) return;`

			`getPadTXT(pad, revNum, function (err, html)`
			`{`
			`if(ERR(err, callback)) return;`
			`callback(null, html);`
			`});`
			`});`
			`}`

			`function _encodeWhitespace(s) {`
			`return s.replace(/[^\x21-\x7E\s\t\n\r]/g, function(c)`
			`{`
			`return "&#" +c.charCodeAt(0) + ";"`
			`});`
			`}`

			`// copied from ACE`
			`function _processSpaces(s)`
			`{`
			`var doesWrap = true;`
			`if (s.indexOf("<") < 0 && !doesWrap)`
			`{`
			`// short-cut`
			`return s.replace(/ /g, ' ');`
			`}`
			`var parts = [];`
			`s.replace(/<[^>]*>?\| \|[^ <]+/g, function (m)`
			`{`
			`parts.push(m);`
			`});`
			`if (doesWrap)`
			`{`
			`var endOfLine = true;`
			`var beforeSpace = false;`
			`// last space in a run is normal, others are nbsp,`
			`// end of line is nbsp`
			`for (var i = parts.length - 1; i >= 0; i--)`
			`{`
			`var p = parts[i];`
			`if (p == " ")`
			`{`
			`if (endOfLine \|\| beforeSpace) parts[i] = ' ';`
			`endOfLine = false;`
			`beforeSpace = true;`
			`}`
			`else if (p.charAt(0) != "<")`
			`{`
			`endOfLine = false;`
			`beforeSpace = false;`
			`}`
			`}`
			`// beginning of line is nbsp`
			`for (var i = 0; i < parts.length; i++)`
			`{`
			`var p = parts[i];`
			`if (p == " ")`
			`{`
			`parts[i] = ' ';`
			`break;`
			`}`
			`else if (p.charAt(0) != "<")`
			`{`
			`break;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (var i = 0; i < parts.length; i++)`
			`{`
			`var p = parts[i];`
			`if (p == " ")`
			`{`
			`parts[i] = ' ';`
			`}`
			`}`
			`}`
			`return parts.join('');`
			`}`


			`// copied from ACE`
			`var _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;`
			`var _REGEX_SPACE = /\s/;`