From 508e3d109ffd4c91dd8f58fcbdf7d23c227c55be Mon Sep 17 00:00:00 2001
From: ArtOfCode- <hello@artofcode.co.uk>
Date: Sun, 10 May 2020 22:08:36 +0100
Subject: [PATCH] XML transforms

---
 Gemfile                        |  3 ++
 Gemfile.lock                   |  1 +
 scripts/import/dump_import.rb  | 77 +++++++++++++++++++++++++++++++---
 scripts/import/stack_import.rb | 33 +++++++++++++--
 4 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/Gemfile b/Gemfile
index 2bf7f4d8d..e2873e8d2 100644
--- a/Gemfile
+++ b/Gemfile
@@ -52,6 +52,9 @@ gem 'stackprof', '~> 0.2'
 gem 'e2mmap', '~> 0.1'
 gem 'thwait', '~> 0.1'
 
+# Progress bars for the import scripts
+gem 'ruby-progressbar', '~> 1.10'
+
 group :test do
   gem 'minitest', '~> 5.10.3'
   gem 'minitest-ci', '~> 3.4.0'
diff --git a/Gemfile.lock b/Gemfile.lock
index b00d4d145..d4b713c6f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -309,6 +309,7 @@ DEPENDENCIES
   rotp (~> 6.0)
   rqrcode (~> 1.1)
   rubocop (~> 0.81)
+  ruby-progressbar (~> 1.10)
   sass-rails (~> 5.0)
   spring (~> 2.1)
   stackprof (~> 0.2)
diff --git a/scripts/import/dump_import.rb b/scripts/import/dump_import.rb
index 81c40d56f..eb25f459b 100644
--- a/scripts/import/dump_import.rb
+++ b/scripts/import/dump_import.rb
@@ -1,4 +1,38 @@
 class DumpImport
+  def self.posts_field_map(community_id, category_id, site_domain)
+    {
+      id: :id,
+      post_type_id: :post_type_id,
+      created_at: :creation_date,
+      score: :score,
+      body: :body,
+      body_markdown: :body,
+      user_id: :owner_user_id,
+      last_activity: :last_activity_date,
+      title: :title,
+      tags_cache: Proc.new { |row| transform_tags(row) },
+      answer_count: :answer_count,
+      parent_id: :parent_id,
+      att_source: Proc.new { |row| "https://#{site_domain}#{row['post_type_id'] == '1' ? '/q/' : '/a/'}#{row['id']}" },
+      att_license_name: Proc.new { |row| determine_license(row)[0] },
+      att_license_link: Proc.new { |row| determine_license(row)[1] },
+      community_id: community_id,
+      category_id: category_id
+    }
+  end
+
+  def self.users_field_map(site_domain)
+    {
+      id: :id,
+      created_at: :creation_date,
+      username: :display_name,
+      website: :website_url,
+      profile: Proc.new { |row| generate_profile(row, site_domain) },
+      profile_markdown: Proc.new { |row| generate_profile(row, site_domain) },
+      se_acct_id: :account_id
+    }
+  end
+
   def self.transform_tags(row)
     tags = row['tags']&.split('><')&.map { |t| t.gsub(/[<>]/, '') }
     tags.nil? ? nil : "---\n- " + tags.join("\n- ")
@@ -13,8 +47,8 @@ class DumpImport
     end
   end
 
-  def self.generate_profile(row)
-    profile_url = "https://#{SITE}/u/#{row['id']}"
+  def self.generate_profile(row, site_domain)
+    profile_url = "https://#{site_domain}/u/#{row['id']}" # link to the user's profile on the source site
     "<p>This user was automatically created as the author of content sourced from Stack Exchange.</p>" \
     "<p>The original profile on Stack Exchange can be found here: <a href=\"#{profile_url}\">#{profile_url}</a>"
   end
@@ -33,8 +67,41 @@ class DumpImport
     input_file_path = File.join(dump_path, "#{data_type}.xml")
     output_file_path = File.join(dump_path, "#{data_type}_Formatted.xml")
 
-    
+    field_map = case data_type
+                when 'Posts'
+                  DumpImport.posts_field_map(community_id, category_id, site_domain)
+                when 'Users'
+                  DumpImport.users_field_map(site_domain)
+                else
+                  raise ArgumentError, "Unsupported data type #{data_type.inspect}"
+                end
+
+    document = Nokogiri::XML(File.read(input_file_path))
+    rows = document.css("#{data_type.downcase} row").to_a
+    rows = rows.map { |r| r.attributes.map { |n, a| [n.underscore, a.content] }.to_h }
+
+    progress = ProgressBar.create(title: "#{data_type} (#{rows.size})", total: rows.size, progress_mark: '█')
+
+    builder = Nokogiri::XML::Builder.new do |xml|
+      xml.resultset do
+        rows.each do |row|
+          xml.row do
+            field_map.each do |field, source|
+              if source.is_a? Symbol
+                xml.send(field, row[source.to_s])
+              elsif source.is_a? Proc
+                xml.send(field, source.call(row))
+              else
+                xml.send(field, source)
+              end
+            end
+          end
+          progress.increment
+        end
+      end
+    end
+
+    File.write(output_file_path, builder.to_xml)
+    rows
   end
 end
-
-DumpImport.do_xml_transform
\ No newline at end of file
diff --git a/scripts/import/stack_import.rb b/scripts/import/stack_import.rb
index e1e452a2d..27b0b4710 100644
--- a/scripts/import/stack_import.rb
+++ b/scripts/import/stack_import.rb
@@ -3,6 +3,8 @@ require 'optparse'
 require 'open-uri'
 require 'csv'
 
+require_relative 'dump_import'
+
 $logger = ::Logger.new(STDOUT)
 $logger.level = :info
 
@@ -19,9 +21,25 @@ def msg2str(msg)
 end
 
 $logger.formatter = proc do |severity, time, progname, msg|
-  colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m", 'FATAL' => "\033[0;31m" }
-  "%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity], severity,
-                                         "\033[0m", progname, msg2str(msg)]
+  colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m",
+             'FATAL' => "\033[0;31m" } # ANSI colour escape sequences keyed by severity name
+  "%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity],
+                                        severity, "\033[0m", progname, msg2str(msg)] # "\033[0m" resets the colour
+end
+
+def domain_from_api_param(api_param)
+  nonstandard = {
+    stackoverflow: '.com',
+    superuser: '.com',
+    serverfault: '.net',
+    askubuntu: '.com',
+    mathoverflow: '.net'
+  }
+  if nonstandard.keys.include? api_param.to_sym
+    "#{api_param}#{nonstandard[api_param.to_sym]}"
+  else
+    "#{api_param}.stackexchange.com"
+  end
 end
 
 ERROR_CODES = {
@@ -86,9 +104,16 @@ unless @options.query.present?
 end
 
 unless @options.key.present?
-  $logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key for added quota.'
+  $logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key ' \
+               'for added quota.'
 end
 
 RequestContext.community = Community.find(@options.community)
 
+# ==================================================================================================================== #
+
+domain = domain_from_api_param(@options.site) # e.g. 'cooking' => 'cooking.stackexchange.com'
 
+users = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Users', dump_path: @options.path)
+posts = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Posts', community_id: @options.community,
+                                    category_id: @options.category, dump_path: @options.path) # returns row hashes
-- 
GitLab