XMLパースはPython速い。Ruby糞遅い。でもRubyにも希望はある。それはOx!

qiita.com

rubyでも試してみた。

結論

rubyは糞遅い。この差はやばいね。pure rubyだと100倍違う。nokogiriでも5秒かかるとか。
これだけ見るとrubyはもうダメだ、という結論になってしまうね。

<追記>

悔しいじゃないですかぁー。いろいろためしましたよー。
Oxというのが良いカンジ。用途によってはNokogiriより使いやすい。速い。libxmlに依存していないらしい。インストールも速い。
遅いPythonぐらいまでは来ている。Oxで良いじゃんとなる。
www.reddit.com
ライブラリを選ぶときは用途とかパフォーマンスを調べる必要がありますね。特に車輪の再発明が多いRubyでは。今回Oxというライブラリを初めて使いました。今まではXMLパーサーといえばNokogiriしか考えたことがなかったけど、ちょっと態度を改めるべきですね。っていうかNokogiriはインストールがあんなに遅いし、よくインストールに失敗するし、なんでRubyのデファクトスタンダードに居座っているのかしら?高機能なの?そんなにみんな使いこなしているのかしら?

python

parse時間: 0.16052603721618652秒

rexml

parse時間: 16.04039192199707

nokogiri(Slop)

parse時間: 4.752880811691284

nokogiri(XML)

parse時間: 3.484325885772705
XMLにしたらいくらか速くなったけどpythonとの差は絶望感しか無い。

Ox

parse時間: 0.2525479793548584
おっと良いカンジ。

Oga

parse時間: 3.470038890838623

Oga(IO渡し)

parse時間: 3.2525930404663086

pure rubyのライブラリrexmlでのソース

#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: true

require 'rexml/document'

# Plain value holder for one product image: its URL, width and height.
class ImageInfo
  attr_accessor :url
  attr_accessor :width
  attr_accessor :height
end

# Record for one book; every field starts out nil except +images+.
class BookInfo
  attr_accessor :asin, :title, :binding, :author
  attr_accessor :publisher, :publicationDate
  # Hash that parse_xmls fills with ImageInfo objects keyed by image label.
  attr_accessor :images

  # Starts each book with its own empty image hash.
  def initialize
    @images = {}
  end
end

# Returns the text of the child element of +dom+ found via +elements[tag]+.
#
# The lookup is performed once and reused. An empty string is returned both
# when the element is missing and when it exists but its text is nil, so
# callers always receive a String.
#
# @param dom [#elements] node whose +elements[tag]+ lookup yields the child
# @param tag [String] element name or path to look up
# @return [String] the element's text, or '' when there is none
def get_text(dom, tag)
  element = dom.elements[tag]
  element&.text || ''
end

# Parses every XML document with REXML and builds one BookInfo per
# ItemLookupResponse/Items/Item element found.
#
# The image label list is built once instead of once per document, and each
# image element is looked up a single time and reused.
#
# @param xmls [Array] XML documents to parse (strings when fed by get_xmls)
# @return [Array<BookInfo>] books from all documents, in encounter order
def parse_xmls(xmls)
  image_labels = %w[SmallImage MediumImage LargeImage]
  bookinfos = []
  xmls.each do |xmlstring|
    doc = REXML::Document.new(xmlstring)
    doc.elements.each('ItemLookupResponse/Items/Item') do |item|
      bookinfo = BookInfo.new
      bookinfo.asin = item.elements['ASIN'].text
      attr = item.elements['ItemAttributes']
      bookinfo.title = get_text(attr, 'Title')
      bookinfo.binding = get_text(attr, 'Binding')
      bookinfo.author = get_text(attr, 'Author')
      bookinfo.publisher = get_text(attr, 'Publisher')
      bookinfo.publicationDate = get_text(attr, 'PublicationDate')

      image_labels.each do |image_label|
        imgtag = item.elements[image_label]
        next unless imgtag # this image label is absent for the item

        image = ImageInfo.new
        image.url = imgtag.elements['URL'].text
        image.width = imgtag.elements['Width'].text.to_i
        image.height = imgtag.elements['Height'].text.to_i
        bookinfo.images[image_label] = image
      end
      bookinfos << bookinfo
    end
  end
  bookinfos
end

# Reads every file matching xmls/*.xml (relative to the working directory).
#
# @return [Array<String>] file contents, one string per file, in glob order
def get_xmls
  Dir.glob('xmls/*.xml').map { |path| File.read(path) }
end

# Benchmark driver: the files are loaded before the timer starts, so only
# parse_xmls is measured.
xmls = get_xmls
# Measure with the monotonic clock; wall-clock time (Time.now) can jump if
# the system clock is adjusted while the benchmark runs.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
bookinfos = parse_xmls(xmls)
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
puts "xml数: #{xmls.size}"
puts "book数: #{bookinfos.size}"
puts "parse時間: #{duration}"

nokogiri

#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: true

# require 'rexml/document'
require 'nokogiri'

# Mutable record with the three values stored for an image:
# url, width and height.
class ImageInfo
  %i[url width height].each { |field| attr_accessor field }
end

# One parsed book. All attributes are readable and writable; +images+ is
# the only one given an initial value.
class BookInfo
  attr_accessor :asin
  attr_accessor :title
  attr_accessor :binding
  attr_accessor :author
  attr_accessor :publisher
  attr_accessor :publicationDate
  attr_accessor :images

  # Gives the new book an empty hash for its images.
  def initialize
    @images = {}
  end
end

# Returns the concatenated text of the nodes under +dom+ matching the CSS
# selector +tag+.
#
# The previous `if dom.css(tag)` guard could never be false: `css` returns a
# node set object, and any object other than nil/false is truthy in Ruby.
# The unreachable branch is dropped and the selector is evaluated only once.
# Nokogiri's node set yields an empty string as the text of zero matches,
# so a missing tag still produces ''.
#
# @param dom [#css] node or node set to search
# @param tag [String] CSS selector (here: an element name)
# @return [String] text of the matches, '' when nothing matches
def get_text(dom, tag)
  dom.css(tag).text
end

# Parses every XML document with Nokogiri::Slop and builds one BookInfo per
# element matched by the CSS selector 'Item'.
#
# Fix: the image guard used to be `next unless item.css(image_label)`, which
# never skipped because `css` returns a (truthy) node set even when nothing
# matches. Every book therefore received all three image entries, with an
# empty URL and zero dimensions for images that were absent. The guard now
# checks whether the node set is empty. The label list is also built once
# instead of once per document.
#
# @param xmls [Array] XML documents to parse (strings when fed by get_xmls)
# @return [Array<BookInfo>] books from all documents, in encounter order
def parse_xmls(xmls)
  image_labels = %w[SmallImage MediumImage LargeImage]
  bookinfos = []
  xmls.each do |xmlstring|
    doc = Nokogiri::Slop(xmlstring)
    doc.css('Item').each do |item|
      bookinfo = BookInfo.new
      bookinfo.asin = item.css('ASIN').text
      attr = item.css('ItemAttributes')
      bookinfo.title = get_text(attr, 'Title')
      bookinfo.binding = get_text(attr, 'Binding')
      bookinfo.author = get_text(attr, 'Author')
      bookinfo.publisher = get_text(attr, 'Publisher')
      bookinfo.publicationDate = get_text(attr, 'PublicationDate')

      image_labels.each do |image_label|
        imgtag = item.css(image_label)
        # An empty node set is still truthy, so test emptiness explicitly.
        next if imgtag.empty?

        image = ImageInfo.new
        image.url = imgtag.css('URL').text
        image.width = imgtag.css('Width').text.to_i
        image.height = imgtag.css('Height').text.to_i
        bookinfo.images[image_label] = image
      end
      bookinfos << bookinfo
    end
  end
  bookinfos
end

# Collects the contents of all xmls/*.xml files.
#
# @return [Array<String>] contents of each matching file, in glob order
def get_xmls
  Dir.glob('xmls/*.xml').each_with_object([]) do |path, contents|
    contents << File.open(path, 'r', &:read)
  end
end

# Benchmark driver: the files are loaded before the timer starts, so only
# parse_xmls is measured.
xmls = get_xmls
# Measure with the monotonic clock; wall-clock time (Time.now) can jump if
# the system clock is adjusted while the benchmark runs.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
bookinfos = parse_xmls(xmls)
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
puts "xml数: #{xmls.size}"
puts "book数: #{bookinfos.size}"
puts "parse時間: #{duration}"

nokogiri(XML+追い込み)

#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# require 'rexml/document'
require 'nokogiri'

# Image record: a URL plus width and height, all freely assignable.
class ImageInfo
  attr_accessor :url
  attr_accessor :width, :height
end

# Book record populated by parse_xmls. Scalar fields default to nil;
# +images+ defaults to an empty hash.
class BookInfo
  %i[asin title binding author publisher publicationDate images].each do |field|
    attr_accessor field
  end

  # Creates the per-book image hash.
  def initialize
    @images = {}
  end
end

# Returns the concatenated text of the nodes under +dom+ matching the CSS
# selector +tag+, or '' when nothing matches.
#
# The selector is evaluated once and the result reused; previously the same
# query ran twice (once for the size check, once for the text).
#
# @param dom [#css] node or node set to search
# @param tag [String] CSS selector (here: an element name)
# @return [String] text of the matches, '' when there are none
def get_text(dom, tag)
  nodes = dom.css(tag)
  nodes.empty? ? '' : nodes.text
end

# Parses every XML document with Nokogiri::XML and builds one BookInfo per
# element matched by the CSS selector 'Item'.
#
# Each image selector is evaluated once per item (it used to run twice: once
# for the presence check and once to fetch the node), and the label list is
# built once instead of once per document.
#
# @param xmls [Array] XML documents to parse (strings when fed by get_xmls)
# @return [Array<BookInfo>] books from all documents, in encounter order
def parse_xmls(xmls)
  image_labels = %w[SmallImage MediumImage LargeImage]
  bookinfos = []
  xmls.each do |xmlstring|
    doc = Nokogiri::XML(xmlstring)
    doc.css('Item').each do |item|
      bookinfo = BookInfo.new
      bookinfo.asin = item.css('ASIN').text
      attr = item.css('ItemAttributes')
      bookinfo.title = get_text(attr, 'Title')
      bookinfo.binding = get_text(attr, 'Binding')
      bookinfo.author = get_text(attr, 'Author')
      bookinfo.publisher = get_text(attr, 'Publisher')
      bookinfo.publicationDate = get_text(attr, 'PublicationDate')

      image_labels.each do |image_label|
        # First match, or nil when the item has no such image element.
        imgtag = item.css(image_label)[0]
        next unless imgtag

        image = ImageInfo.new
        image.url = imgtag.css('URL').text
        image.width = imgtag.css('Width').text.to_i
        image.height = imgtag.css('Height').text.to_i
        bookinfo.images[image_label] = image
      end
      bookinfos << bookinfo
    end
  end
  bookinfos
end

# Loads the full text of each xmls/*.xml file into memory.
#
# @return [Array<String>] file contents in the order Dir.glob lists them
def get_xmls
  paths = Dir.glob('xmls/*.xml')
  paths.map do |path|
    File.open(path, 'r') { |io| io.read }
  end
end

# Benchmark driver: the files are loaded before the timer starts, so only
# parse_xmls is measured.
xmls = get_xmls
# Measure with the monotonic clock; wall-clock time (Time.now) can jump if
# the system clock is adjusted while the benchmark runs.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
bookinfos = parse_xmls(xmls)
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
puts "xml数: #{xmls.size}"
puts "book数: #{bookinfos.size}"
puts "parse時間: #{duration}"

Ox

#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# require 'rexml/document'
# require 'nokogiri'
require 'ox'

# Container for the url, width and height recorded for one image.
class ImageInfo
  attr_accessor :url
  attr_accessor :width
  attr_accessor :height
end

# Container for one book's fields plus a hash of its images.
class BookInfo
  attr_accessor :asin, :title
  attr_accessor :binding, :author
  attr_accessor :publisher, :publicationDate
  attr_accessor :images

  # A fresh, empty image hash is created for every instance.
  def initialize
    @images = {}
  end
end

# Returns the text of the first node that +dom.locate(tag)+ finds.
#
# +locate+ is called once and its first result reused (it used to be called
# twice). An empty string is returned both when nothing is located and when
# the located node's text is nil, so callers always receive a String.
#
# @param dom [#locate] element to search
# @param tag [String] locate path (here: a child element name)
# @return [String] the node's text, or '' when there is none
def get_text(dom, tag)
  node = dom.locate(tag).first
  node&.text || ''
end

# Parses every XML document with Ox and builds one BookInfo per node located
# at ItemLookupResponse/Items/Item.
#
# Each image label is located once per item (it used to be located twice:
# once for the presence check and once to fetch the node), and the label
# list is built once instead of once per document.
#
# @param xmls [Array] XML documents to parse (strings when fed by get_xmls)
# @return [Array<BookInfo>] books from all documents, in encounter order
def parse_xmls(xmls)
  image_labels = %w[SmallImage MediumImage LargeImage]
  bookinfos = []
  xmls.each do |xmlstring|
    doc = Ox.parse(xmlstring)
    doc.locate('ItemLookupResponse/Items/Item').each do |item|
      bookinfo = BookInfo.new
      bookinfo.asin = item.ASIN.text
      attr = item.ItemAttributes
      bookinfo.title = get_text(attr, 'Title')
      bookinfo.binding = get_text(attr, 'Binding')
      bookinfo.author = get_text(attr, 'Author')
      bookinfo.publisher = get_text(attr, 'Publisher')
      bookinfo.publicationDate = get_text(attr, 'PublicationDate')

      image_labels.each do |image_label|
        # First located node, or nil when the item has no such image element.
        imgtag = item.locate(image_label).first
        next unless imgtag

        image = ImageInfo.new
        image.url = imgtag.URL.text
        image.width = imgtag.Width.text.to_i
        image.height = imgtag.Height.text.to_i
        bookinfo.images[image_label] = image
      end
      bookinfos << bookinfo
    end
  end
  bookinfos
end

# Returns the contents of every xmls/*.xml file as an array of strings.
#
# @return [Array<String>] one entry per matching file, in glob order
def get_xmls
  contents = []
  Dir.glob('xmls/*.xml').each { |path| contents << File.read(path) }
  contents
end

# Benchmark driver: the files are loaded before the timer starts, so only
# parse_xmls is measured.
xmls = get_xmls
# Measure with the monotonic clock; wall-clock time (Time.now) can jump if
# the system clock is adjusted while the benchmark runs.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
bookinfos = parse_xmls(xmls)
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
puts "xml数: #{xmls.size}"
puts "book数: #{bookinfos.size}"
puts "parse時間: #{duration}"

oga

#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# require 'rexml/document'
# require 'nokogiri'
# require 'ox'
require 'oga'

# Read/write record holding an image's url, width and height.
class ImageInfo
  attr_accessor :url, :width
  attr_accessor :height
end

# Read/write record for a single book. Only +images+ has an initial value.
class BookInfo
  attr_accessor :asin, :title, :binding
  attr_accessor :author, :publisher, :publicationDate
  # Hash that parse_xmls fills with ImageInfo objects keyed by image label.
  attr_accessor :images

  # Sets up the empty image hash.
  def initialize
    @images = {}
  end
end

# Returns the text of the first node matching the XPath +tag+ under +dom+,
# or '' when the query matches nothing.
#
# The XPath is evaluated once and the first result reused; previously the
# same query ran twice (once for the size check, once to fetch the node).
#
# @param dom [#xpath] node to search
# @param tag [String] XPath expression relative to +dom+
# @return the first match's +text+, or '' when there is no match
def get_text(dom, tag)
  node = dom.xpath(tag)[0]
  node ? node.text : ''
end

# Parses every XML document with Oga and builds one BookInfo per node
# matched by the XPath ItemLookupResponse/Items/Item.
#
# Each image XPath is evaluated once per item (it used to run twice: once
# for the presence check and once to fetch the node), and the label list is
# built once instead of once per document.
#
# @param xmls [Array] XML documents to parse; each entry is handed to
#   Oga.parse_xml as-is
# @return [Array<BookInfo>] books from all documents, in encounter order
def parse_xmls(xmls)
  image_labels = %w[SmallImage MediumImage LargeImage]
  bookinfos = []
  xmls.each do |xmlstring|
    doc = Oga.parse_xml(xmlstring)
    doc.xpath('ItemLookupResponse/Items/Item').each do |item|
      bookinfo = BookInfo.new
      bookinfo.asin = item.xpath('ASIN').text
      bookinfo.title = get_text(item, 'ItemAttributes/Title')
      bookinfo.binding = get_text(item, 'ItemAttributes/Binding')
      bookinfo.author = get_text(item, 'ItemAttributes/Author')
      bookinfo.publisher = get_text(item, 'ItemAttributes/Publisher')
      bookinfo.publicationDate = get_text(item, 'ItemAttributes/PublicationDate')

      image_labels.each do |image_label|
        # First match, or nil when the item has no such image element.
        imgtag = item.xpath(image_label)[0]
        next unless imgtag

        image = ImageInfo.new
        image.url = imgtag.xpath('URL').text
        image.width = imgtag.xpath('Width').text.to_i
        image.height = imgtag.xpath('Height').text.to_i
        bookinfo.images[image_label] = image
      end
      bookinfos << bookinfo
    end
  end
  bookinfos
end

# Slurps each xmls/*.xml file and returns the strings.
#
# @return [Array<String>] file contents, ordered as Dir.glob returns them
def get_xmls
  xml_paths = Dir.glob('xmls/*.xml')
  xml_paths.map { |xml_path| File.read(xml_path) }
end

# Benchmark driver: get_xmls runs before the timer starts, so only
# parse_xmls is measured.
xmls = get_xmls
# Measure with the monotonic clock; wall-clock time (Time.now) can jump if
# the system clock is adjusted while the benchmark runs.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
bookinfos = parse_xmls(xmls)
duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
puts "xml数: #{xmls.size}"
puts "book数: #{bookinfos.size}"
puts "parse時間: #{duration}"
oga(IO渡し)
# Opens every xmls/*.xml file for reading and returns the open handles
# instead of the file contents. Nothing in this method closes the handles.
#
# @return [Array<File>] open read-mode handles, in glob order
def get_xmls
  Dir.glob('xmls/*.xml').map { |path| File.open(path, 'r') }
end