sparklemotion/nokogiri

View on GitHub
ext/java/nokogiri/Html4SaxParserContext.java

Summary

Maintainability
A
1 hr
Test Coverage
package nokogiri;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.xerces.parsers.AbstractSAXParser;
import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;

import nokogiri.internals.NokogiriHandler;
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

/**
 * Class for Nokogiri::HTML4::SAX::ParserContext.
 *
 * @author serabe
 * @author Patrick Mahoney <pat@polycrystal.org>
 * @author Yoko Harada <yokolet@gmail.com>
 */
@JRubyClass(name = "Nokogiri::HTML4::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext")
public class Html4SaxParserContext extends XmlSaxParserContext
{
  private static final long serialVersionUID = 1L;

  /**
   * Builds a context of the given Ruby class and runs its
   * {@code initialize(runtime)} step before handing it back.
   *
   * @param runtime the JRuby runtime the context is created in
   * @param klazz the Ruby class the new context is an instance of
   * @return the constructed and initialized context
   */
  static Html4SaxParserContext
  newInstance(final Ruby runtime, final RubyClass klazz)
  {
    final Html4SaxParserContext ctx = new Html4SaxParserContext(runtime, klazz);
    ctx.initialize(runtime);
    return ctx;
  }

  /**
   * Forwards both arguments to the superclass constructor; no additional
   * work is done here.
   *
   * @param ruby the JRuby runtime
   * @param rubyClass the Ruby class of the object being constructed
   */
  public
  Html4SaxParserContext(final Ruby ruby, final RubyClass rubyClass)
  {
    super(ruby, rubyClass);
  }

  /**
   * Creates and configures the NekoHTML SAX parser used by this context.
   *
   * The parser is asked to use the "lower" setting for both element and
   * attribute names, and to ignore any charset specified inside the document.
   *
   * @return the configured parser
   * @throws SAXException if a property or feature cannot be set; the
   *         original exception is attached as the cause
   */
  @Override
  protected AbstractSAXParser
  createParser() throws SAXException
  {
    SAXParser parser = new SAXParser();

    try {
      parser.setProperty(
        "http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty(
        "http://cyberneko.org/html/properties/names/attrs", "lower");

      // NekoHTML should not try to guess the encoding based on the meta
      // tags or other information in the document.  This is already
      // handled by the EncodingReader.
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
      return parser;
    } catch (SAXException ex) {
      // Chain the original exception so its type and stack trace stay
      // available to whoever handles the wrapped error.
      throw new SAXException(
        "Problem while creating HTML4 SAX Parser: " + ex.toString(), ex);
    }
  }

  /**
   * Ruby class method {@code memory}: creates a context that parses an
   * in-memory string.
   *
   * The input source is always set from {@code data}.  When
   * {@code encoding} resolves to a Java charset name, the last
   * {@code charset=...} declaration in the text is rewritten to that name
   * and the name is also set on the input source.  Previously an unresolved
   * encoding (for example nil) left the context without any input source,
   * silently dropping {@code data}.
   *
   * @param context the current thread context
   * @param klazz the Ruby class to instantiate (must be a RubyClass)
   * @param data the document text; converted with {@code convertToString()}
   * @param encoding a Ruby String or Fixnum naming the encoding; any other
   *        type means "no explicit encoding"
   * @return the new parser context
   */
  @JRubyMethod(name = "memory", meta = true)
  public static IRubyObject
  parse_memory(ThreadContext context,
               IRubyObject klazz,
               IRubyObject data,
               IRubyObject encoding)
  {
    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
    String javaEncoding = findEncodingName(context, encoding);
    String input = rubyStringToString(data.convertToString());
    if (javaEncoding != null) {
      input = applyEncoding(input, javaEncoding).toString();
    }
    // NOTE(review): getBytes() uses the platform default charset, exactly as
    // before; whether that is the right charset depends on how
    // rubyStringToString decodes the Ruby string -- confirm.
    ctx.setInputSource(new ByteArrayInputStream(input.getBytes()));
    if (javaEncoding != null) {
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Encodings addressable by an integer code.  Each constant carries its
   * integer code and the encoding name returned by {@link #toString()}.
   *
   * The integer code of every constant equals its ordinal; {@link #get(int)}
   * relies on that.
   * NOTE(review): the numbering looks like libxml2's xmlCharEncoding
   * values -- confirm before adding or reordering constants.
   */
  public enum EncodingType {
    NONE(0, "NONE"),
    UTF_8(1, "UTF-8"),
    UTF16LE(2, "UTF16LE"),
    UTF16BE(3, "UTF16BE"),
    UCS4LE(4, "UCS4LE"),
    UCS4BE(5, "UCS4BE"),
    EBCDIC(6, "EBCDIC"),
    UCS4_2143(7, "UCS4-2143"),
    UCS4_3412(8, "UCS4-3412"),
    UCS2(9, "UCS2"),
    ISO_8859_1(10, "ISO-8859-1"),
    ISO_8859_2(11, "ISO-8859-2"),
    ISO_8859_3(12, "ISO-8859-3"),
    ISO_8859_4(13, "ISO-8859-4"),
    ISO_8859_5(14, "ISO-8859-5"),
    ISO_8859_6(15, "ISO-8859-6"),
    ISO_8859_7(16, "ISO-8859-7"),
    ISO_8859_8(17, "ISO-8859-8"),
    ISO_8859_9(18, "ISO-8859-9"),
    ISO_2022_JP(19, "ISO-2022-JP"),
    SHIFT_JIS(20, "SHIFT-JIS"),
    EUC_JP(21, "EUC-JP"),
    ASCII(22, "ASCII");

    // values() returns a fresh array on every call, so keep a single copy.
    private static final EncodingType[] VALUES = values();

    private final int value;
    private final String name;

    EncodingType(int value, String name)
    {
      this.value = value;
      this.name = name;
    }

    /**
     * @return the integer code of this encoding
     */
    public int getValue()
    {
      return value;
    }

    /**
     * @return the encoding name associated with this constant
     */
    @Override
    public String toString()
    {
      return name;
    }

    /**
     * Looks up a constant by its integer code.
     *
     * NOTE: assuming ordinal == value
     *
     * @param ordinal the integer code to look up
     * @return the matching constant, or null when the code is out of range
     */
    static EncodingType get(final int ordinal)
    {
      if (ordinal >= 0 && ordinal < VALUES.length) {
        return VALUES[ordinal];
      }
      return null;
    }

  }

  /**
   * Translates an integer encoding code into the encoding name registered
   * for it in {@link EncodingType}.
   *
   * @param value the integer encoding code
   * @return the encoding name, or null when {@code EncodingType.get}
   *         returns no constant for the code
   */
  private static String
  findEncodingName(final int value)
  {
    final EncodingType match = EncodingType.get(value);
    if (match != null) {
      // Guards the ordinal == value assumption made by EncodingType.get.
      assert match.value == value;
      return match.name;
    }
    return null;
  }

  /**
   * Resolves a Ruby-side encoding argument to a Java charset name.
   *
   * A Ruby String is used as the encoding name directly; a Ruby Fixnum is
   * first mapped to a name through {@code findEncodingName(int)}.  Any other
   * argument type, or a Fixnum with no mapping, yields null.
   *
   * @param context the current thread context, used to raise Ruby errors
   * @param encoding the Ruby encoding argument
   * @return the canonical Java charset name, or null when no encoding name
   *         could be derived from the argument
   * @throws org.jruby.exceptions.RaiseException a Ruby
   *         Encoding::CompatibilityError when the charset is unknown to this
   *         JVM, or a Ruby EncodingError when the name is not a legal
   *         charset name
   */
  private static String
  findEncodingName(ThreadContext context, IRubyObject encoding)
  {
    String rubyEncoding = null;
    if (encoding instanceof RubyString) {
      rubyEncoding = rubyStringToString((RubyString) encoding);
    } else if (encoding instanceof RubyFixnum) {
      rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
    }
    if (rubyEncoding == null) { return null; }
    try {
      // name() is the canonical charset name; displayName() is documented as
      // a human-readable name for the default locale.
      return Charset.forName(rubyEncoding).name();
    } catch (UnsupportedCharsetException e) {
      throw context.runtime.newEncodingCompatibilityError(rubyEncoding + " is not supported");
    } catch (IllegalCharsetNameException e) {
      throw context.runtime.newEncodingError(e.getMessage());
    }
  }

  // Matches "charset=<name>" case-insensitively, allowing at most one
  // whitespace character on either side of '='.  The name is matched with a
  // character class instead of a repeated alternation group: it accepts the
  // same characters (letters, digits, '-', '_') but does not recurse once per
  // character, which could exhaust the stack on a very long token.
  private static final Pattern CHARSET_PATTERN = Pattern.compile("charset\\s?=\\s?[a-z\\d_-]+",
      Pattern.CASE_INSENSITIVE);

  /**
   * Rewrites the last {@code charset=<name>} declaration found in
   * {@code input} so that it names {@code enc}.
   *
   * @param input the document text
   * @param enc the encoding name to write into the declaration
   * @return a new character sequence with the last declaration replaced by
   *         {@code "charset=" + enc}, or {@code input} itself when it
   *         contains no such declaration
   */
  private static CharSequence
  applyEncoding(final String input, final String enc)
  {
    int startPos = -1;
    int endPos = -1;
    // Cheap substring scan first, so the regex only runs on candidates.
    if (containsIgnoreCase(input, "charset")) {
      Matcher m = CHARSET_PATTERN.matcher(input);
      // Keep overwriting so the positions of the LAST match win.
      while (m.find()) {
        startPos = m.start();
        endPos = m.end();
      }
    }
    if (startPos < 0) {
      return input;
    }
    return new StringBuilder(input).replace(startPos, endPos, "charset=" + enc);
  }

  /**
   * Case-insensitive substring test.
   *
   * The first character of {@code sub} is compared by hand at each
   * candidate position; only when it matches is the rest compared with
   * {@code String.regionMatches(true, ...)}.
   *
   * @param str the text to search in
   * @param sub the text to search for; an empty string always matches
   * @return true when {@code sub} occurs in {@code str} ignoring case
   */
  private static boolean
  containsIgnoreCase(final String str, final String sub)
  {
    final int subLength = sub.length();
    final int lastStart = str.length() - subLength;
    if (subLength == 0) {
      return true;
    }

    final char first = sub.charAt(0);
    final char firstLower = Character.toLowerCase(first);
    final char firstUpper = Character.toUpperCase(first);

    int pos = 0;
    while (pos <= lastStart) {
      final char candidate = str.charAt(pos);
      final boolean firstMatches = candidate == firstLower
                                   || Character.toLowerCase(candidate) == firstLower
                                   || Character.toUpperCase(candidate) == firstUpper;
      // The first character is already checked, so compare from index 1 on.
      if (firstMatches && str.regionMatches(true, pos + 1, sub, 1, subLength - 1)) {
        return true;
      }
      pos++;
    }
    return false;
  }

  /**
   * Ruby class method {@code file}: creates a context whose input is set up
   * by {@code setInputSourceFile} from {@code data}.
   *
   * @param context the current thread context
   * @param klass the Ruby class to instantiate (must be a RubyClass)
   * @param data a Ruby String passed to {@code setInputSourceFile}
   * @param encoding a Ruby String naming the encoding
   * @return the new parser context
   * @throws org.jruby.exceptions.RaiseException a Ruby TypeError when
   *         {@code data} or {@code encoding} is not a Ruby String
   */
  @JRubyMethod(name = "file", meta = true)
  public static IRubyObject
  parse_file(ThreadContext context,
             IRubyObject klass,
             IRubyObject data,
             IRubyObject encoding)
  {
    if (!(data instanceof RubyString)) {
      throw context.runtime.newTypeError("data must be kind_of String");
    }
    if (!(encoding instanceof RubyString)) {
      // Name the argument that actually failed the check.
      throw context.runtime.newTypeError("encoding must be kind_of String");
    }

    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
    ctx.setInputSourceFile(context, data);
    String javaEncoding = findEncodingName(context, encoding);
    if (javaEncoding != null) {
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Ruby class method {@code io}: creates a context whose input is set up
   * by {@code setIOInputSource} from {@code data}.
   *
   * Unlike {@code file}, the encoding must be given as an integer code (see
   * {@link EncodingType}); a String encoding is rejected.
   *
   * @param context the current thread context
   * @param klass the Ruby class to instantiate (must be a RubyClass)
   * @param data the Ruby object passed to {@code setIOInputSource}
   * @param encoding a Ruby Fixnum holding the integer encoding code
   * @return the new parser context
   * @throws org.jruby.exceptions.RaiseException a Ruby TypeError when
   *         {@code encoding} is not a Fixnum
   */
  @JRubyMethod(name = "io", meta = true)
  public static IRubyObject
  parse_io(ThreadContext context,
           IRubyObject klass,
           IRubyObject data,
           IRubyObject encoding)
  {
    if (!(encoding instanceof RubyFixnum)) {
      // The check requires an integer code, so the message must say so.
      throw context.runtime.newTypeError("encoding must be kind_of Integer");
    }

    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
    ctx.setIOInputSource(context, data, context.nil);
    String javaEncoding = findEncodingName(context, encoding);
    if (javaEncoding != null) {
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Builds a parser context whose input source is the given raw stream.
   * Intended to be run on a separate thread by Html4SaxPushParser.
   *
   * @param runtime the JRuby runtime
   * @param klass the Ruby class to instantiate the context as
   * @param stream the raw input to read the document from
   * @return the new parser context
   */
  static Html4SaxParserContext
  parse_stream(final Ruby runtime, RubyClass klass, InputStream stream)
  {
    final Html4SaxParserContext streamContext = Html4SaxParserContext.newInstance(runtime, klass);
    streamContext.setInputSource(stream);
    return streamContext;
  }

  /**
   * Intentionally does nothing: this override replaces the
   * {@code preParse} step inherited from XmlSaxParserContext with a no-op.
   *
   * @param runtime unused
   * @param handlerRuby unused
   * @param handler unused
   */
  @Override
  protected void
  preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
  {
    // Deliberately empty; see the method comment.
  }

}