Skip to content
Snippets Groups Projects
Commit 508e3d10 authored by ArtOfCode-'s avatar ArtOfCode-
Browse files

XML transforms

parent e268277b
Branches
Tags
No related merge requests found
...@@ -52,6 +52,9 @@ gem 'stackprof', '~> 0.2' ...@@ -52,6 +52,9 @@ gem 'stackprof', '~> 0.2'
gem 'e2mmap', '~> 0.1' gem 'e2mmap', '~> 0.1'
gem 'thwait', '~> 0.1' gem 'thwait', '~> 0.1'
# Stuff for imports
gem 'ruby-progressbar', '~> 1.10'
group :test do group :test do
gem 'minitest', '~> 5.10.3' gem 'minitest', '~> 5.10.3'
gem 'minitest-ci', '~> 3.4.0' gem 'minitest-ci', '~> 3.4.0'
......
...@@ -309,6 +309,7 @@ DEPENDENCIES ...@@ -309,6 +309,7 @@ DEPENDENCIES
rotp (~> 6.0) rotp (~> 6.0)
rqrcode (~> 1.1) rqrcode (~> 1.1)
rubocop (~> 0.81) rubocop (~> 0.81)
ruby-progressbar (~> 1.10)
sass-rails (~> 5.0) sass-rails (~> 5.0)
spring (~> 2.1) spring (~> 2.1)
stackprof (~> 0.2) stackprof (~> 0.2)
......
class DumpImport class DumpImport
# Builds the field map for rows of the 'Posts' data type.
#
# Keys are output field names. A Symbol value names the row attribute to copy, a Proc value is called
# with the row to compute the field, and any other value is written out unchanged.
#
# @param community_id value written to every row's community_id field
# @param category_id value written to every row's category_id field
# @param site_domain [String] domain used to build the att_source link
# @return [Hash] output field name => Symbol, Proc or literal value
def self.posts_field_map(community_id, category_id, site_domain)
  # Rows whose post_type_id is '1' link under /q/, every other row under /a/.
  source_link = proc do |row|
    path = row['post_type_id'] == '1' ? '/q/' : '/a/'
    "https://#{site_domain}#{path}#{row['id']}"
  end
  tag_list = proc { |row| transform_tags(row) }
  # determine_license is indexed as a pair here: [0] feeds the name field, [1] the link field.
  license_name = proc { |row| determine_license(row)[0] }
  license_link = proc { |row| determine_license(row)[1] }

  {
    id: :id,
    post_type_id: :post_type_id,
    created_at: :creation_date,
    score: :score,
    body: :body,
    body_markdown: :body,
    user_id: :owner_user_id,
    last_activity: :last_activity_date,
    title: :title,
    tags_cache: tag_list,
    answer_count: :answer_count,
    parent_id: :parent_id,
    att_source: source_link,
    att_license_name: license_name,
    att_license_link: license_link,
    community_id: community_id,
    category_id: category_id
  }
end
# Builds the field map for rows of the 'Users' data type.
#
# Keys are output field names. A Symbol value names the row attribute to copy; a Proc value is called
# with the row to compute the field.
#
# @param site_domain [String] domain handed to generate_profile for each row
# @return [Hash] output field name => Symbol or Proc
def self.users_field_map(site_domain)
  # profile and profile_markdown are both filled from the same generated text.
  profile_text = proc { |row| generate_profile(row, site_domain) }

  {
    id: :id,
    created_at: :creation_date,
    username: :display_name,
    website: :website_url,
    profile: profile_text,
    profile_markdown: profile_text,
    se_acct_id: :account_id
  }
end
def self.transform_tags(row) def self.transform_tags(row)
tags = row['tags']&.split('><')&.map { |t| t.gsub(/[<>]/, '') } tags = row['tags']&.split('><')&.map { |t| t.gsub(/[<>]/, '') }
tags.nil? ? nil : "---\n- " + tags.join("\n- ") tags.nil? ? nil : "---\n- " + tags.join("\n- ")
...@@ -13,8 +47,8 @@ class DumpImport ...@@ -13,8 +47,8 @@ class DumpImport
end end
end end
def self.generate_profile(row) def self.generate_profile(row, site_domain)
profile_url = "https://#{SITE}/u/#{row['id']}" profile_url = "https://#{site_domain}/u/#{row['id']}"
"<p>This user was automatically created as the author of content sourced from Stack Exchange.</p>" \ "<p>This user was automatically created as the author of content sourced from Stack Exchange.</p>" \
"<p>The original profile on Stack Exchange can be found here: <a href=\"#{profile_url}\">#{profile_url}</a>" "<p>The original profile on Stack Exchange can be found here: <a href=\"#{profile_url}\">#{profile_url}</a>"
end end
...@@ -33,8 +67,41 @@ class DumpImport ...@@ -33,8 +67,41 @@ class DumpImport
input_file_path = File.join(dump_path, "#{data_type}.xml") input_file_path = File.join(dump_path, "#{data_type}.xml")
output_file_path = File.join(dump_path, "#{data_type}_Formatted.xml") output_file_path = File.join(dump_path, "#{data_type}_Formatted.xml")
field_map = case data_type
when 'Posts'
DumpImport.posts_field_map(community_id, category_id, site_domain)
when 'Users'
DumpImport.users_field_map(site_domain)
else
raise ArgumentError, "Unsupported data type #{data_type.inspect}"
end
document = Nokogiri::XML(File.read(input_file_path))
rows = document.css("#{data_type.downcase} row").to_a
rows = rows.map { |r| r.attributes.map { |n, a| [n.underscore, a.content] }.to_h }
progress = ProgressBar.create(title: "#{data_type} (#{rows.size})", total: rows.size, progress_mark: '█')
builder = Nokogiri::XML::Builder.new do |xml|
xml.resultset do
rows.each do |row|
xml.row do
field_map.each do |field, source|
if source.is_a? Symbol
xml.send(field, row[source.to_s])
elsif source.is_a? Proc
xml.send(field, source.call(row))
else
xml.send(field, source)
end
end
end
progress.increment
end
end end
end end
DumpImport.do_xml_transform File.write(output_file_path, builder.to_xml)
\ No newline at end of file rows
end
end
...@@ -3,6 +3,8 @@ require 'optparse' ...@@ -3,6 +3,8 @@ require 'optparse'
require 'open-uri' require 'open-uri'
require 'csv' require 'csv'
require_relative 'dump_import'
$logger = ::Logger.new(STDOUT) $logger = ::Logger.new(STDOUT)
$logger.level = :info $logger.level = :info
...@@ -19,9 +21,25 @@ def msg2str(msg) ...@@ -19,9 +21,25 @@ def msg2str(msg)
end end
$logger.formatter = proc do |severity, time, progname, msg| $logger.formatter = proc do |severity, time, progname, msg|
colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m", 'FATAL' => "\033[0;31m" } colors = { 'DEBUG' => "\033[0;37m", 'INFO' => "\033[1;36m", 'WARN' => "\033[1;33m", 'ERROR' => "\033[1;31m",
"%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity], severity, 'FATAL' => "\033[0;31m" }
"\033[0m", progname, msg2str(msg)] "%s, [%s #%d] %s%5s%s -- %s: %s\n" % [severity[0..0], time.strftime('%Y-%m-%d %H:%M:%S'), $$, colors[severity],
severity, "\033[0m", progname, msg2str(msg)]
end
# Resolves an API site parameter to that site's domain name.
#
# Sites listed in the table below get their own top-level domain; every other parameter is treated
# as a subdomain of stackexchange.com.
#
# @param api_param [String, Symbol] API site parameter, e.g. 'stackoverflow' or 'cooking'
# @return [String] the domain, e.g. 'stackoverflow.com' or 'cooking.stackexchange.com'
def domain_from_api_param(api_param)
  nonstandard = {
    stackoverflow: '.com',
    superuser: '.com',
    serverfault: '.net',
    askubuntu: '.com',
    mathoverflow: '.net'
  }
  # A single fetch with a default replaces the keys.include? scan plus second lookup.
  suffix = nonstandard.fetch(api_param.to_sym, '.stackexchange.com')
  "#{api_param}#{suffix}"
end
ERROR_CODES = { ERROR_CODES = {
...@@ -86,9 +104,16 @@ unless @options.query.present? ...@@ -86,9 +104,16 @@ unless @options.query.present?
end end
unless @options.key.present? unless @options.key.present?
$logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key for added quota.' $logger.warn 'No key specified. Can run without one, but only for a limited run. Large imports will require a key ' \
'for added quota.'
end end
RequestContext.community = Community.find(@options.community) RequestContext.community = Community.find(@options.community)
# ==================================================================================================================== #
domain = domain_from_api_param(@options.site)
users = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Users', dump_path: @options.path)
posts = DumpImport.do_xml_transform(site_domain: domain, data_type: 'Posts', community_id: @options.community,
category_id: @options.category, dump_path: @options.path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment