枕を欹てて聴く

香炉峰の雪は簾を撥げて看る

Selection内のHTML Textをいい感じに取得する

追記:
Rangeを使うと, 86行に及ぶDOM操作がなんと2行に!
勝手に添削: Selection内のHTML Textをいい感じに取得する: Days on the Moon


Tomblooの最新版ではEvernoteの取り込み機能がつき, これに伴いHTML Textを取得する周りで大規模な変更がありました.
これのなかで nsIDocumentEncoder などのXPCOMが使われているのですが, Taberareloo移植の際, もちろんXPCOMなんてものはChromeにはありません.
それで, 書いてみました.

function convertToHTMLString(src, safe){

  var doc  = src.ownerDocument || src.focusNode.ownerDocument;

  if(src.focusNode){
    // selection

    // common parent node search
    var current,
        anchorOffset = src.anchorOffset,
        focusOffset = src.focusOffset;
    var anchorAnc = [src.anchorNode];
    current = src.anchorNode;
    while(document !== (current = current.parentNode) && current)
      anchorAnc.push(current);
    var focusAnc = [src.focusNode];
    current = src.focusNode;
    while(document !== (current = current.parentNode) && current)
      focusAnc.push(current);
    var common, aindex, findex;
    anchorAnc.some(function(item, index){
      if(~(findex = focusAnc.indexOf(item))){
        aindex = index;
        common = item;
        return true;
      } else {
        return false;
      }
    });
    // common配下nodeまで切捨て
    anchorAnc.length = aindex;
    focusAnc.length = findex;
    // commonから見た順番に変更
    focusAnc.reverse();
    anchorAnc.reverse();

    // indexに変換する => cloneNodeでcloneするので, 位置関係を保存する
    current = common;
    for(var i = 0, len = focusAnc.length; i < len; ++i){
      var node = focusAnc[i];
      focusAnc[i] = $A(current.childNodes).indexOf(node)
      current = node;
    }
    current = common;
    for(var i = 0, len = anchorAnc.length; i < len; ++i){
      var node = anchorAnc[i];
      anchorAnc[i] = $A(current.childNodes).indexOf(node)
      current = node;
    }
    // clone
    common = common.cloneNode(true);

    if(focusAnc[0] !== undefined){
      // common配下のindexを見て, focus と anchorがどちらが前方かを調べる
      // commonは最小の共通nodeであるので, すぐ下からindexが異なる
      if(focusAnc[0] < anchorAnc[0]){
        var t = focusAnc;
        focusAnc = anchorAnc;
        anchorAnc = t;
        t = focusOffset;
        focusOffset = anchorOffset;
        anchorOffset = t;
      }

      // focusに沿って後方をremove
      // 後方から削る => index調整不要
      current = common;
      focusAnc.forEach(function(obj){
        var children = $A(current.childNodes);
        for(var i = obj+1, len = children.length; i < len; i++){
          current.removeChild(children[i]);
        }
        current = children[obj];
      });
      if(current.nodeType === Node.TEXT_NODE){
        var val = current.textContent;
        if(val.length !== focusOffset){
          current.parentNode.replaceChild($T(val.substring(0, focusOffset)), current);
        }
      }
      // anchorに沿って前方をremove
      current = common;
      anchorAnc.forEach(function(obj){
        var children = $A(current.childNodes);
        for(var i = 0, len = obj; i < len; i++){
          current.removeChild(children[i]);
        }
        current = children[obj];
      });
      if(current.nodeType === Node.TEXT_NODE && anchorOffset){
        current.parentNode.replaceChild($T(current.textContent.substring(anchorOffset)), current);
      }
    } else {
      if(common.nodeType === Node.TEXT_NODE){
        // TextNode配下にnodeが来ることはないという仮定のもと成立
        // TextNode only
        if(focusOffset < anchorOffset){
          var t = focusOffset;
          focusOffset = anchorOffset;
          anchorOffset = t;
        }
        common = $T(common.textContent.substring(anchorOffset, focusOffset));
      }
    }
  } else {
    var common = src;
  }

  var html = (new XMLSerializer).serializeToString(common);
  if(!safe)
    return html;

  // DOMツリーに戻し不要な要素を除去する
  var root = doc.createElement('span');
  root.innerHTML = html;

  $X('.//*[contains(",' + convertToHTMLString.UNSAFE_ELEMENTS + ',", concat(",", local-name(.), ","))]', root).forEach(removeElement);
  $X('.//@*[not(contains(",' + convertToHTMLString.SAFE_ATTRIBUTES + ',", concat(",", local-name(.), ",")))]', root).forEach(function(attr){
    if(attr && attr.ownerElement) attr.ownerElement.removeAttribute(attr.name);
  });
  src = appendChildNodes($DF(), root.childNodes);

  // 再度HTML文字列へ変換する
  return convertToHTMLString(src);
}

update(convertToHTMLString , {
	UNSAFE_ELEMENTS : 'frame,script,style,frame,iframe',
	SAFE_ATTRIBUTES : 'action,cellpadding,cellspacing,checked,cite,clear,cols,colspan,content,coords,enctype,face,for,href,label,method,name,nohref,nowrap,rel,rows,rowspan,shape,span,src,style,target,type,usemap,value'
});

ちなみにTaberareloo内でのcodeなので, 以下のutility関数を利用しています.

function $A(arr){
  return Array.prototype.slice.call(arr);
};

var $T = function(mes){
  return document.createTextNode(mes);
};

var $DF = function(){
  return document.createDocumentFragment();
}

function update(t, s){
  if(s){
    Object.keys(s).forEach(function(key){
      t[key] = s[key];
    });
  }
  return t;
}

// http://gist.github.com/184276
// a little modified

// Cross Browser $X
// XPath 式中の接頭辞のない名前テストに接頭辞 prefix を追加する
// e.g. '//body[@class = "foo"]/p' -> '//prefix:body[@class = "foo"]/prefix:p'
// http://nanto.asablo.jp/blog/2008/12/11/4003371
function addDefaultPrefix(xpath, prefix) {
  var tokenPattern = /([A-Za-z_\u00c0-\ufffd][\w\-.\u00b7-\ufffd]*|\*)\s*(::?|\()?|(".*?"|'.*?'|\d+(?:\.\d*)?|\.(?:\.|\d+)?|[\)\]])|(\/\/?|!=|[<>]=?|[\(\[|,=+-])|([@$])/g;
  var TERM = 1, OPERATOR = 2, MODIFIER = 3;
  var tokenType = OPERATOR;
  prefix += ':';
  function replacer(token, identifier, suffix, term, operator, modifier) {
    if (suffix) {
      tokenType =
        (suffix === ':' || (suffix === '::' && (identifier === 'attribute' || identifier === 'namespace')))
        ? MODIFIER : OPERATOR;
    } else if (identifier) {
      if (tokenType === OPERATOR && identifier != '*')
        token = prefix + token;
      tokenType = (tokenType === TERM) ? OPERATOR : TERM;
    } else {
      tokenType = term ? TERM : operator ? OPERATOR : MODIFIER;
    }
    return token;
  }
  return xpath.replace(tokenPattern, replacer);
}

// $X on XHTML
// @target Freifox3, Chrome3, Safari4, Opera10
// @source http://gist.github.com/184276.txt
// a little modified ver
function $X (exp, context) {
  context || (context = document);
  var _document  = context.ownerDocument || context,
  documentElement = _document.documentElement,
  isXHTML = documentElement.tagName !== 'HTML' && _document.createElement('p').tagName === 'p',
  defaultPrefix = null;
  if (isXHTML) {
    defaultPrefix = '__default__';
    exp = addDefaultPrefix(exp, defaultPrefix);
  }
  function resolver (prefix) {
    return context.lookupNamespaceURI(prefix === defaultPrefix ? null : prefix) ||
         documentElement.namespaceURI || "";
  }
  function value(node){
    if(!node) return;

    switch(node.nodeType) {
      case Node.ELEMENT_NODE:
        return node;
      case Node.ATTRIBUTE_NODE:
      case Node.TEXT_NODE:
        return node.textContent;
    }
  }

  var result = _document.evaluate(exp, context, resolver, XPathResult.ANY_TYPE, null);
  switch (result.resultType) {
    case XPathResult.STRING_TYPE : return result.stringValue;
    case XPathResult.NUMBER_TYPE : return result.numberValue;
    case XPathResult.BOOLEAN_TYPE: return result.booleanValue;
    case XPathResult.UNORDERED_NODE_ITERATOR_TYPE:
      // not ensure the order.
      var ret = [], i = null;
      while (i = result.iterateNext()) ret.push(value(i));
      return ret;
  }
}

これを使うと
f:id:Constellation:20100203230720p:image
といった選択をしたときに,

convertToHTMLString(window.getSelection());

とすると

<div class="news"><div class="alert push"><div class="body"><div class="title"><a href="/shyouhei">uhei</a>
  <span>pushed</span> to
  trunk
  at
  <a href="http://github.com/shyouhei/ruby/tree/trunk">shyouhei/ruby</a>

  <abbr class="relatize relatized" title="2010-02-03 04:00:43">about an hour ago</abbr>
</div>

<div class="details">
  <div class="gravatar"><img alt="" height="30" src="https://secure.gravatar.com/avatar/9d2f78236e45a335301ba1195026105d?s=30&amp;d=https%3A%2F%2Fgithub.com%2Fimages%2Fgravatars%2Fgravatar-30.png" width="30"></div>
  HEAD is <a href="http://github.com/shyouhei/ruby/tree/aae19a5bee8525a9fbc10d11cc61e1da22064650">aae19a5bee8525a9fbc10d11cc61e1da22064650</a> 
  
  
  
    <a href="#" class="hide_commits" style="display:none">x</a>

    
      <p class="reveal">
        <a href="#" class="reveal_commits">2 commits ≫</a>
      </p>
    

    <img src="/images/modules/ajax/indicator.gif" class="spin" style="display:none;">
  

  <div class="commits" style="display:none;">
    <ul>
      
      
        
        
        <li class="first">
          

          <img alt="" height="16" src="https://secure.gravatar.com/avatar/364e8c965038ef5cfa4836bff8b6ff24?s=16&amp;d=https%3A%2F%2Fgithub.com%2Fimages%2Fgravatars%2Fgravatar-16.png" width="16">

          
            <span class="committer">akr</span>
          

          committed
          <a href="http://github.com/shyouhei/ruby/commit/aae19a5bee8525a9fbc10d11cc61e1da22064650">aae19a5b</a>:

          <div class="message">
            <blockquote title="* ext/iconv/charset_alias.rb: pass block argument to outer local
  variable.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@26552 b2dd03c8-39d4-4d8f-98ff-823fe69b080e">
              * ext/iconv/charset_alias.rb: pass block argument to outer local
            </blockquote>
          </div>
        </li>
      
        
        
        <li class="">
          

          <img alt="" height="16" src="https://secure.gravatar.com/avatar/364e8c965038ef5cfa4836bff8b6ff24?s=16&amp;d=https%3A%2F%2Fgithub.com%2Fimages%2Fgravatars%2Fgravatar-16.png" width="16">

          
            <span class="committer">akr</span>
          

          committed
          <a href="http://github.com/shyouhei/ruby/commit/7cf68d2ead0e2f2567fa5563a824f558f8bbf836">7cf68d2e</a>:

          <div class="message">
            <blockquote title="* file.c (realpath_rec): rb_path_last_separator may return NULL.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@26551 b2dd03c8-39d4-4d8f-98ff-823fe69b080e">
              * file.c (realpath_rec): rb_path_last_separator may return NULL.
            </blockquote>
          </div>
        </li>
      
      
    </ul>
  </div>
</div></div></div>
      
        <div class="alert watch_started"><div class="body"><div class="title">
  <a href="http://github.com/walf443">walf443</a>
  <span>started watching</span>
  <a href="http://github.com/kyo-ago/mist.js">kyo-ago/mist.js</a> 

  <abbr title="2010-02-03 03:39:35">
    <abbr class="relatize relatized" title="2010-02-03 03:39:35">about 2 hours ago</abbr>
  </abbr>
</div>
<div class="details">
  <div class="gravatar"><img alt="" height="30" src="https://secure.gravatar.com/avatar/dea4fe079aec284744ad02eb3d2594f5?s=30&amp;d=https%3A%2F%2Fgithub.com%2Fimages%2Fgravatars%2Fgravatar-30.png" width="30"></div>
  <div class="message">
    
      mist.js's description:
      <blockquote>
        mixiapp framework mist</blockquote></div></div></div></div></div>

というようなHTML Textが得られます. これをHTMLとして表示すると
f:id:Constellation:20100203230719p:image
となるような感じになっています. ちゃんと途中までしか選択していない部分は切れています.