From 508e3d109ffd4c91dd8f58fcbdf7d23c227c55be Mon Sep 17 00:00:00 2001
From: ArtOfCode- <hello@artofcode.co.uk>
Date: Sun, 10 May 2020 22:08:36 +0100
Subject: [PATCH] XML transforms

---
Notes (not part of the commit message):

* DumpImport.posts_field_map / DumpImport.users_field_map return hashes keyed
  by output element name. A Symbol value names the row attribute to copy, a
  Proc value is called with the row, and any other value is written as-is.
* The transform reads <dump_path>/<data_type>.xml, writes
  <dump_path>/<data_type>_Formatted.xml and returns the parsed rows. Only
  'Posts' and 'Users' are supported; any other data type raises ArgumentError.
* stack_import.rb maps the API site parameter to a domain
  (domain_from_api_param) and runs the transform for Users, then Posts.
* NOTE(review): the line structure of this patch was restored from a
  whitespace-collapsed copy. Indentation of context and removed lines follows
  Ruby convention and should be confirmed against the target tree; if a strict
  apply fails, retry with `git apply --ignore-whitespace`.

 Gemfile                        |  3 ++
 Gemfile.lock                   |  1 +
 scripts/import/dump_import.rb  | 77 +++++++++++++++++++++++++++++++---
 scripts/import/stack_import.rb | 33 +++++++++++++--
 4 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/Gemfile b/Gemfile
index 2bf7f4d8d..e2873e8d2 100644
--- a/Gemfile
+++ b/Gemfile
@@ -52,6 +52,9 @@ gem 'stackprof', '~> 0.2'
 gem 'e2mmap', '~> 0.1'
 gem 'thwait', '~> 0.1'
 
+# Stuff for imports
+gem 'ruby-progressbar', '~> 1.10'
+
 group :test do
   gem 'minitest', '~> 5.10.3'
   gem 'minitest-ci', '~> 3.4.0'
diff --git a/Gemfile.lock b/Gemfile.lock
index b00d4d145..d4b713c6f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -309,6 +309,7 @@ DEPENDENCIES
   rotp (~> 6.0)
   rqrcode (~> 1.1)
   rubocop (~> 0.81)
+  ruby-progressbar (~> 1.10)
   sass-rails (~> 5.0)
   spring (~> 2.1)
   stackprof (~> 0.2)
diff --git a/scripts/import/dump_import.rb b/scripts/import/dump_import.rb
index 81c40d56f..eb25f459b 100644
--- a/scripts/import/dump_import.rb
+++ b/scripts/import/dump_import.rb
@@ -1,4 +1,38 @@
 class DumpImport
+  def self.posts_field_map(community_id, category_id, site_domain)
+    {
+      id: :id,
+      post_type_id: :post_type_id,
+      created_at: :creation_date,
+      score: :score,
+      body: :body,
+      body_markdown: :body,
+      user_id: :owner_user_id,
+      last_activity: :last_activity_date,
+      title: :title,
+      tags_cache: Proc.new { |row| transform_tags(row) },
+      answer_count: :answer_count,
+      parent_id: :parent_id,
+      att_source: Proc.new { |row| "https://#{site_domain}#{row['post_type_id'] == '1' ? '/q/' : '/a/'}#{row['id']}" },
+      att_license_name: Proc.new { |row| determine_license(row)[0] },
+      att_license_link: Proc.new { |row| determine_license(row)[1] },
+      community_id: community_id,
+      category_id: category_id
+    }
+  end
+
+  def self.users_field_map(site_domain)
+    {
+      id: :id,
+      created_at: :creation_date,
+      username: :display_name,
+      website: :website_url,
+      profile: Proc.new { |row| generate_profile(row, site_domain) },
+      profile_markdown: Proc.new { |row| generate_profile(row, site_domain) },
+      se_acct_id: :account_id
+    }
+  end
+
   def self.transform_tags(row)
     tags = row['tags']&.split('><')&.map { |t| t.gsub(/[<>]/, '') }
     tags.nil? ? nil : "---\n- " + tags.join("\n- ")
@@ -13,8 +47,8 @@ class DumpImport
     end
   end
 
-  def self.generate_profile(row)
-    profile_url = "https://#{SITE}/u/#{row['id']}"
+  def self.generate_profile(row, site_domain)
+    profile_url = "https://#{site_domain}/u/#{row['id']}"
     "<p>This user was automatically created as the author of content sourced from Stack Exchange.</p>" \
     "<p>The original profile on Stack Exchange can be found here: <a href=\"#{profile_url}\">#{profile_url}</a>"
   end
@@ -33,8 +67,41 @@ class DumpImport
 
     input_file_path = File.join(dump_path, "#{data_type}.xml")
     output_file_path = File.join(dump_path, "#{data_type}_Formatted.xml")
-
+    field_map = case data_type
+                when 'Posts'
+                  DumpImport.posts_field_map(community_id, category_id, site_domain)
+                when 'Users'
+                  DumpImport.users_field_map(site_domain)
+                else
+                  raise ArgumentError, "Unsupported data type #{data_type.inspect}"
+                end
+
+    document = Nokogiri::XML(File.read(input_file_path))
+    rows = document.css("#{data_type.downcase} row").to_a
+    rows = rows.map { |r| r.attributes.map { |n, a| [n.underscore, a.content] }.to_h }
+
+    progress = ProgressBar.create(title: "#{data_type} (#{rows.size})", total: rows.size, progress_mark: '█')
+
+    builder = Nokogiri::XML::Builder.new do |xml|
+      xml.resultset do
+        rows.each do |row|
+          xml.row do
+            field_map.each do |field, source|
+              if source.is_a? Symbol
+                xml.send(field, row[source.to_s])
+              elsif source.is_a? Proc
+                xml.send(field, source.call(row))
+              else
+                xml.send(field, source)
+              end
+            end
+          end
+          progress.increment
+        end
+      end
+    end
+
+    File.write(output_file_path, builder.to_xml)
+    rows
   end
 end
-
-DumpImport.do_xml_transform
\ No newline at end of file
diff --git a/scripts/import/stack_import.rb b/scripts/import/stack_import.rb
index e1e452a2d..27b0b4710 100644
--- a/scripts/import/stack_import.rb
+++ b/scripts/import/stack_import.rb
@@ -3,6 +3,8 @@ require 'optparse'
 require 'open-uri'
 require 'csv'
 
+require_relative 'dump_import'
+
 $logger = ::Logger.new(STDOUT)
 $logger.level = :info
 
@@ -19,9 +21,25 @@ def msg2str(msg)
 end
 
 $logger.formatter = proc do |severity, time, progname, msg|
-  colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m", 'FATAL' => "\033[0;31m" }
-  "%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity], severity,
-    "\033[0m", progname, msg2str(msg)]
+  colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m",
+             'FATAL' => "\033[0;31m" }
+  "%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity],
+                                        severity, "\033[0m", progname, msg2str(msg)]
+end
+
+def domain_from_api_param(api_param)
+  nonstandard = {
+    stackoverflow: '.com',
+    superuser: '.com',
+    serverfault: '.net',
+    askubuntu: '.com',
+    mathoverflow: '.net'
+  }
+  if nonstandard.keys.include? api_param.to_sym
+    "#{api_param}#{nonstandard[api_param.to_sym]}"
+  else
+    "#{api_param}.stackexchange.com"
+  end
 end
 
 ERROR_CODES = {
@@ -86,9 +104,16 @@ unless @options.query.present?
 end
 
 unless @options.key.present?
-  $logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key for added quota.'
+  $logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key ' \
+               'for added quota.'
 end
 
 RequestContext.community = Community.find(@options.community)
 
+# ==================================================================================================================== #
+
+domain = domain_from_api_param(@options.site)
+users = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Users', dump_path: @options.path)
+posts = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Posts', community_id: @options.community,
+                                    category_id: @options.category, dump_path: @options.path)
 
-- 
GitLab