XMLパースはPython速い。Ruby糞遅い。でもRubyにも希望はある。それはOx!
rubyでも試してみた。
結論
rubyは糞遅い。この差はやばいね。pure rubyだと100倍違う。nokogiriでも5秒かかるとか。
これだけ見るとrubyはもうダメだ、という結論になってしまうね。
<追記>
悔しいじゃないですかぁー。いろいろためしましたよー。
Oxというのが良いカンジ。用途によってはNokogiriより使いやすい。速い。libxmlに依存していないらしい。インストールも速い。
遅いPythonぐらいまでは来ている。Oxで良いじゃんとなる。
www.reddit.com
ライブラリを選ぶときは用途とかパフォーマンスを調べる必要がありますね。特に車輪の再発明が多いRubyでは。今回Oxというライブラリを初めて使い、今まではXMLパーサーはNokogiriしか考えたことなかったけど、ちょっと態度を改めるべきですね。っていうかNokogiriはインストールあんなに遅いし、よくインストール失敗するし、なんでRubyのデファクトスタンダードに居座っているのかしら?高機能なの?そんなみんな使いこなしているのかしら?
python
parse時間: 0.16052603721618652秒
rexml
parse時間: 16.04039192199707
nokogiri(Slop)
parse時間: 4.752880811691284
Ox
parse時間: 0.2525479793548584
おっと良いカンジ。
Oga
parse時間: 3.470038890838623
Oga(IO渡し)
parse時間: 3.2525930404663086
pure rubyのライブラリrexmlでのソース
#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: true

# Benchmark script: parses ItemLookupResponse XML documents with REXML
# (pure Ruby) and reports how long the parse step took.

require 'rexml/document'

# Names of the child elements of an Item that carry image metadata.
IMAGE_LABELS = %w[SmallImage MediumImage LargeImage].freeze

# URL and pixel dimensions of one image variant.
class ImageInfo
  attr_accessor :url, :width, :height
end

# Fields extracted from a single Item element. +images+ maps an image label
# (e.g. 'SmallImage') to its ImageInfo.
class BookInfo
  attr_accessor :asin, :title, :binding, :author, :publisher, :publicationDate, :images

  def initialize
    @images = {}
  end
end

# Returns the text of the first element matching +tag+ under +dom+.
# Returns '' when +dom+ is nil, the element is missing, or it has no text.
def get_text(dom, tag)
  return '' unless dom

  # Single lookup: REXML re-evaluates the path on every elements[] call.
  dom.elements[tag]&.text || ''
end

# Builds an ImageInfo from an image element with URL/Width/Height children.
# Missing children yield '' for the URL and 0 for the dimensions.
def build_image(imgtag)
  image = ImageInfo.new
  image.url = get_text(imgtag, 'URL')
  image.width = get_text(imgtag, 'Width').to_i
  image.height = get_text(imgtag, 'Height').to_i
  image
end

# Builds a BookInfo from one Item element. Absent elements become ''
# instead of raising NoMethodError on nil.
def build_book(item)
  bookinfo = BookInfo.new
  bookinfo.asin = get_text(item, 'ASIN')
  attr = item.elements['ItemAttributes']
  bookinfo.title = get_text(attr, 'Title')
  bookinfo.binding = get_text(attr, 'Binding')
  bookinfo.author = get_text(attr, 'Author')
  bookinfo.publisher = get_text(attr, 'Publisher')
  bookinfo.publicationDate = get_text(attr, 'PublicationDate')
  IMAGE_LABELS.each do |image_label|
    imgtag = item.elements[image_label]
    bookinfo.images[image_label] = build_image(imgtag) if imgtag
  end
  bookinfo
end

# Parses every XML string in +xmls+ and returns a flat Array of BookInfo,
# one per ItemLookupResponse/Items/Item element, in document order.
def parse_xmls(xmls)
  xmls.each_with_object([]) do |xmlstring, bookinfos|
    doc = REXML::Document.new(xmlstring)
    doc.elements.each('ItemLookupResponse/Items/Item') do |item|
      bookinfos << build_book(item)
    end
  end
end

# Reads every file matching xmls/*.xml (relative to the current directory)
# and returns their contents as an Array of Strings.
def get_xmls
  # File.read instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.read(file) }
end

if __FILE__ == $PROGRAM_NAME
  xmls = get_xmls
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  bookinfos = parse_xmls(xmls)
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "xml数: #{xmls.size}"
  puts "book数: #{bookinfos.size}"
  puts "parse時間: #{duration}"
end
nokogiri
#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: true

# Benchmark script: parses ItemLookupResponse XML documents with
# Nokogiri::Slop and reports how long the parse step took.

require 'nokogiri'

# Names of the elements under an Item that carry image metadata.
IMAGE_LABELS = %w[SmallImage MediumImage LargeImage].freeze

# URL and pixel dimensions of one image variant.
class ImageInfo
  attr_accessor :url, :width, :height
end

# Fields extracted from a single Item element. +images+ maps an image label
# (e.g. 'SmallImage') to its ImageInfo.
class BookInfo
  attr_accessor :asin, :title, :binding, :author, :publisher, :publicationDate, :images

  def initialize
    @images = {}
  end
end

# Returns the concatenated text of every node under +dom+ matching the CSS
# selector +tag+, or '' when nothing matches.
def get_text(dom, tag)
  # css always returns a NodeSet (never nil), and an empty NodeSet's text is
  # already '', so no presence check is needed.
  dom.css(tag).text
end

# Builds an ImageInfo from an image node with URL/Width/Height descendants.
def build_image(imgtag)
  image = ImageInfo.new
  image.url = get_text(imgtag, 'URL')
  image.width = get_text(imgtag, 'Width').to_i
  image.height = get_text(imgtag, 'Height').to_i
  image
end

# Builds a BookInfo from one Item node.
def build_book(item)
  bookinfo = BookInfo.new
  bookinfo.asin = get_text(item, 'ASIN')
  attr = item.css('ItemAttributes')
  bookinfo.title = get_text(attr, 'Title')
  bookinfo.binding = get_text(attr, 'Binding')
  bookinfo.author = get_text(attr, 'Author')
  bookinfo.publisher = get_text(attr, 'Publisher')
  bookinfo.publicationDate = get_text(attr, 'PublicationDate')
  IMAGE_LABELS.each do |image_label|
    # at_css returns nil when nothing matches; the previous `unless css(...)`
    # test was always truthy, so absent images were recorded as blank entries.
    imgtag = item.at_css(image_label)
    bookinfo.images[image_label] = build_image(imgtag) if imgtag
  end
  bookinfo
end

# Parses every XML string in +xmls+ and returns a flat Array of BookInfo,
# one per Item element found, in document order.
def parse_xmls(xmls)
  xmls.flat_map do |xmlstring|
    doc = Nokogiri::Slop(xmlstring)
    doc.css('Item').map { |item| build_book(item) }
  end
end

# Reads every file matching xmls/*.xml (relative to the current directory)
# and returns their contents as an Array of Strings.
def get_xmls
  # File.read instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.read(file) }
end

if __FILE__ == $PROGRAM_NAME
  xmls = get_xmls
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  bookinfos = parse_xmls(xmls)
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "xml数: #{xmls.size}"
  puts "book数: #{bookinfos.size}"
  puts "parse時間: #{duration}"
end
nokogiri(XML+追い込み)
#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# Benchmark script: parses ItemLookupResponse XML documents with
# Nokogiri::XML and reports how long the parse step took.

require 'nokogiri'

# Names of the elements under an Item that carry image metadata.
IMAGE_LABELS = %w[SmallImage MediumImage LargeImage].freeze

# URL and pixel dimensions of one image variant.
class ImageInfo
  attr_accessor :url, :width, :height
end

# Fields extracted from a single Item element. +images+ maps an image label
# (e.g. 'SmallImage') to its ImageInfo.
class BookInfo
  attr_accessor :asin, :title, :binding, :author, :publisher, :publicationDate, :images

  def initialize
    @images = {}
  end
end

# Returns the concatenated text of every node under +dom+ matching the CSS
# selector +tag+, or '' when nothing matches.
def get_text(dom, tag)
  # One query instead of two: an empty NodeSet's text is already '', so the
  # separate size check only repeated the CSS search.
  dom.css(tag).text
end

# Builds an ImageInfo from an image node with URL/Width/Height descendants.
def build_image(imgtag)
  image = ImageInfo.new
  image.url = get_text(imgtag, 'URL')
  image.width = get_text(imgtag, 'Width').to_i
  image.height = get_text(imgtag, 'Height').to_i
  image
end

# Builds a BookInfo from one Item node.
def build_book(item)
  bookinfo = BookInfo.new
  bookinfo.asin = get_text(item, 'ASIN')
  attr = item.css('ItemAttributes')
  bookinfo.title = get_text(attr, 'Title')
  bookinfo.binding = get_text(attr, 'Binding')
  bookinfo.author = get_text(attr, 'Author')
  bookinfo.publisher = get_text(attr, 'Publisher')
  bookinfo.publicationDate = get_text(attr, 'PublicationDate')
  IMAGE_LABELS.each do |image_label|
    # First match or nil, fetched once (previously queried twice per label).
    imgtag = item.at_css(image_label)
    bookinfo.images[image_label] = build_image(imgtag) if imgtag
  end
  bookinfo
end

# Parses every XML string in +xmls+ and returns a flat Array of BookInfo,
# one per Item element found, in document order.
def parse_xmls(xmls)
  xmls.flat_map do |xmlstring|
    doc = Nokogiri::XML(xmlstring)
    doc.css('Item').map { |item| build_book(item) }
  end
end

# Reads every file matching xmls/*.xml (relative to the current directory)
# and returns their contents as an Array of Strings.
def get_xmls
  # File.read instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.read(file) }
end

if __FILE__ == $PROGRAM_NAME
  xmls = get_xmls
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  bookinfos = parse_xmls(xmls)
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "xml数: #{xmls.size}"
  puts "book数: #{bookinfos.size}"
  puts "parse時間: #{duration}"
end
Ox
#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# Benchmark script: parses ItemLookupResponse XML documents with Ox and
# reports how long the parse step took.

require 'ox'

# Names of the child elements of an Item that carry image metadata.
IMAGE_LABELS = %w[SmallImage MediumImage LargeImage].freeze

# URL and pixel dimensions of one image variant.
class ImageInfo
  attr_accessor :url, :width, :height
end

# Fields extracted from a single Item element. +images+ maps an image label
# (e.g. 'SmallImage') to its ImageInfo.
class BookInfo
  attr_accessor :asin, :title, :binding, :author, :publisher, :publicationDate, :images

  def initialize
    @images = {}
  end
end

# Returns the text of the first node located at path +tag+ under +dom+.
# Returns '' when +dom+ is nil, nothing is located, or the node has no text.
def get_text(dom, tag)
  return '' unless dom

  # Single locate call; the result was previously computed twice.
  dom.locate(tag).first&.text || ''
end

# Builds an ImageInfo from an image element with URL/Width/Height children.
# Missing children yield '' for the URL and 0 for the dimensions.
def build_image(imgtag)
  image = ImageInfo.new
  image.url = get_text(imgtag, 'URL')
  image.width = get_text(imgtag, 'Width').to_i
  image.height = get_text(imgtag, 'Height').to_i
  image
end

# Builds a BookInfo from one Item element. Elements are fetched with locate
# rather than dynamic accessors (item.ASIN), which raise when the element is
# absent.
def build_book(item)
  bookinfo = BookInfo.new
  bookinfo.asin = get_text(item, 'ASIN')
  attr = item.locate('ItemAttributes').first
  bookinfo.title = get_text(attr, 'Title')
  bookinfo.binding = get_text(attr, 'Binding')
  bookinfo.author = get_text(attr, 'Author')
  bookinfo.publisher = get_text(attr, 'Publisher')
  bookinfo.publicationDate = get_text(attr, 'PublicationDate')
  IMAGE_LABELS.each do |image_label|
    imgtag = item.locate(image_label).first
    bookinfo.images[image_label] = build_image(imgtag) if imgtag
  end
  bookinfo
end

# Parses every XML string in +xmls+ and returns a flat Array of BookInfo,
# one per node located at ItemLookupResponse/Items/Item.
def parse_xmls(xmls)
  xmls.flat_map do |xmlstring|
    doc = Ox.parse(xmlstring)
    # NOTE(review): this path assumes Ox.parse returned a document node whose
    # child is ItemLookupResponse — confirm for inputs without an XML prolog.
    doc.locate('ItemLookupResponse/Items/Item').map { |item| build_book(item) }
  end
end

# Reads every file matching xmls/*.xml (relative to the current directory)
# and returns their contents as an Array of Strings.
def get_xmls
  # File.read instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.read(file) }
end

if __FILE__ == $PROGRAM_NAME
  xmls = get_xmls
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  bookinfos = parse_xmls(xmls)
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "xml数: #{xmls.size}"
  puts "book数: #{bookinfos.size}"
  puts "parse時間: #{duration}"
end
oga
#! /usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: false

# Benchmark script: parses ItemLookupResponse XML documents with Oga and
# reports how long the parse step took.

require 'oga'

# Names of the child elements of an Item that carry image metadata.
IMAGE_LABELS = %w[SmallImage MediumImage LargeImage].freeze

# URL and pixel dimensions of one image variant.
class ImageInfo
  attr_accessor :url, :width, :height
end

# Fields extracted from a single Item element. +images+ maps an image label
# (e.g. 'SmallImage') to its ImageInfo.
class BookInfo
  attr_accessor :asin, :title, :binding, :author, :publisher, :publicationDate, :images

  def initialize
    @images = {}
  end
end

# Returns the text of the first node matching the XPath +tag+ under +dom+,
# or '' when nothing matches.
def get_text(dom, tag)
  # Single XPath evaluation; the query was previously run twice per lookup.
  dom.xpath(tag)[0]&.text || ''
end

# Builds an ImageInfo from an image element with URL/Width/Height children.
# Missing children yield '' for the URL and 0 for the dimensions.
def build_image(imgtag)
  image = ImageInfo.new
  image.url = get_text(imgtag, 'URL')
  image.width = get_text(imgtag, 'Width').to_i
  image.height = get_text(imgtag, 'Height').to_i
  image
end

# Builds a BookInfo from one Item element.
def build_book(item)
  bookinfo = BookInfo.new
  bookinfo.asin = get_text(item, 'ASIN')
  bookinfo.title = get_text(item, 'ItemAttributes/Title')
  bookinfo.binding = get_text(item, 'ItemAttributes/Binding')
  bookinfo.author = get_text(item, 'ItemAttributes/Author')
  bookinfo.publisher = get_text(item, 'ItemAttributes/Publisher')
  bookinfo.publicationDate = get_text(item, 'ItemAttributes/PublicationDate')
  IMAGE_LABELS.each do |image_label|
    # Evaluate the XPath once and reuse the node for the presence check.
    imgtag = item.xpath(image_label)[0]
    bookinfo.images[image_label] = build_image(imgtag) if imgtag
  end
  bookinfo
end

# Parses every entry of +xmls+ with Oga.parse_xml and returns a flat Array of
# BookInfo, one per node matching ItemLookupResponse/Items/Item.
def parse_xmls(xmls)
  bookinfos = []
  xmls.each do |xmlstring|
    doc = Oga.parse_xml(xmlstring)
    doc.xpath('ItemLookupResponse/Items/Item').each do |item|
      bookinfos << build_book(item)
    end
  end
  bookinfos
end

# Reads every file matching xmls/*.xml (relative to the current directory)
# and returns their contents as an Array of Strings.
def get_xmls
  # File.read instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.read(file) }
end

if __FILE__ == $PROGRAM_NAME
  xmls = get_xmls
  # Monotonic clock: unaffected by wall-clock adjustments during the run.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  bookinfos = parse_xmls(xmls)
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "xml数: #{xmls.size}"
  puts "book数: #{bookinfos.size}"
  puts "parse時間: #{duration}"
end
oga(IO渡し)
# Opens every file matching xmls/*.xml (relative to the current directory)
# for reading and returns the open File objects, so a parser can consume the
# IO directly instead of a preloaded String.
#
# The handles are returned open on purpose; the caller is responsible for
# closing them once parsing is done.
def get_xmls
  # File.open instead of Kernel#open: Kernel#open treats a leading '|' in the
  # name as a command to spawn.
  Dir.glob('xmls/*.xml').map { |file| File.open(file, 'r') }
end