class Libis::Ingester::BbNotesCollector
Protected Instance Methods
Checks if an HTML file has already been processed. Note: this function keeps track of previously processed files in the instance variable @html_processed and it emits a warning message if it encounters a duplicate. @param [String] rel_path @param [Integer] line line number in CSV we're currently processing @return [Boolean] true if given file has not been processed before
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 230
#
# Registers an HTML file as processed and reports whether it was new.
#
# Every path passed in is remembered in @html_processed (created on first
# use). A path that was registered earlier triggers a warning and is rejected.
#
# @param [String] rel_path path identifying the HTML file
# @param [Integer] line CSV line number; only used in the warning message
# @return [Boolean] true on the first occurrence, false for a duplicate
def check_duplicate_html(rel_path, line)
  @html_processed ||= Set.new

  # Set#add? returns nil when the element is already a member.
  return true if @html_processed.add?(rel_path)

  warn 'Duplicate HTML file entry found in CSV: `%s` on line %d. Ignoring this duplicate entry.', rel_path, line
  false
end
Check if a file exists. The function emits a warning message if the file does not exist. @param [String] rel_path @return [Boolean] true if file exists
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 244
#
# Verifies that a referenced file is present on disk.
#
# The path is resolved through #full_path; a missing file produces a warning.
#
# @param [String] rel_path path to resolve with #full_path
# @return [Boolean] true when the resolved path exists, false otherwise
def check_file_exist(rel_path)
  return true if full_path(rel_path).exist?

  warn 'File `%s` not found in export directory. Ignoring this file reference.', rel_path
  false
end
Calculate absolute path using the root_dir parameter as base dir @param [Array] rel_path list of relative paths @return [Pathname] full path name
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 198
#
# Resolves path segments against the directory held in the :root_dir parameter.
#
# @param [Array] rel_path path segments to append to the base directory
# @return [Pathname] the base directory joined with the given segments
def full_path(*rel_path)
  base_dir = Pathname.new(parameter(:root_dir))
  base_dir.join(*rel_path)
end
Check if file should be ignored @param [Pathname] rel_name file name to check @return [Boolean] true if file should be ignored
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 220
#
# Tells whether a file name matches one of the patterns to skip: GIF files
# below an `icons/` directory, or `TempBody*` / `graycol` GIF/JPG files that
# sit inside some directory.
#
# @param [Pathname, String] rel_name file name to check (converted with #to_s)
# @return [Integer, nil] match offset (truthy, may be 0) when the file should
#   be ignored, nil otherwise
def ignore_file(rel_name)
  name = rel_name.to_s

  icon_at = name =~ /icons\/.*\.gif$/
  return icon_at if icon_at

  name =~ /\/(TempBody.*|graycol)\.(gif|jpg)$/
end
Check if link should be ignored @param [Pathname] link link to check @return [Boolean] true if link should be ignored
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 212
#
# Tells whether a link found in an HTML file should be skipped.
#
# Links pointing outside the export (mail addresses and web URLs, both plain
# and TLS) are always skipped; any other link is judged by #ignore_file.
#
# @param [String, Pathname] link link to check (converted with #to_s)
# @return [Boolean] true for external links, otherwise the (truthy or falsy)
#   result of #ignore_file
def ignore_link(link)
  # `https?` so that secure web links are not mistaken for local file references.
  return true if link.to_s =~ /^(mailto:|https?:)/

  ignore_file(link)
end
Convert link to a cleaned-up, URI-unescaped path string @param [String] link @return [String]
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 205
#
# Turns a link taken from an HTML attribute into a normalised path string:
# percent-escapes are decoded and the path is cleaned (`.` and `..` segments
# and redundant separators are collapsed).
#
# @param [String] link link as found in the HTML
# @return [String] decoded and cleaned path
def link2path(link)
  # URI.unescape no longer exists on Ruby 3.x; the default parser offers the
  # same decoding (a literal `+` is left untouched).
  decoded = URI::DEFAULT_PARSER.unescape(link)
  Pathname.new(decoded).cleanpath.to_s
end
Process the input directory on the FTP server for new material @param [Libis::Ingester::Run] item
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 62
#
# Builds the item structure described by the CSV listing.
#
# For every CSV row that refers to a not-yet-seen, existing HTML file this
# method:
# * finds or creates a chain of Collection items mirroring the file's
#   directory path, starting below +item+;
# * creates an IntellectualEntity with a Dublin Core metadata record filled
#   from the CSV columns;
# * attaches the HTML file and every file/image it links to as FileItems.
#
# Side effects on parameters: :csv_file is replaced by its path below
# :root_dir when it is not found as given, and :root_dir gets a `documenten`
# sub directory appended unless it already ends with one.
#
# @param [Libis::Ingester::Run] item parent item under which everything is created
# @raise [Libis::WorkflowAbort] when the CSV file cannot be located
def process(item)
  # The CSV file may be given as-is or relative to 'root_dir'.
  unless File.exist?(parameter(:csv_file))
    csv_path = File.join(parameter(:root_dir), parameter(:csv_file))
    unless File.exist?(csv_path)
      raise Libis::WorkflowAbort,
            "CSV file '#{parameter(:csv_file)}' cannot be found. It should be absolute or relative to 'root_dir'."
    end
    parameter(:csv_file, csv_path)
  end

  # csv = Libis::Tools::Csv.open(parameter(:csv_file), mode: 'rb:windows-1252:UTF-8', required: %w'Pad')
  csv = Libis::Tools::Csv.open(
    parameter(:csv_file),
    col_sep: ';',
    required: %w'Pad TitelTX DocumentsvormTX DocumentdatumDT DossiersTX AuteurTX'
  )

  close_result = nil
  begin
    unless parameter(:root_dir) =~ /\/documenten\/?$/
      parameter(:root_dir, File.join(parameter(:root_dir), 'documenten'))
    end

    ie_count = 0

    csv.each_with_index do |row, line|
      # Strip the export prefix and switch to forward slashes.
      rel_path = row['Pad'].gsub(/^c:\\export\\documenten\\/, '').gsub(/\\/, '/')
      title = row['TitelTX']
      doctype = row['DocumentsvormTX']
      docdate = row['DocumentdatumDT']
      # Try date + time first, then date only; anything unparsable becomes nil.
      docdate = (DateTime.strptime(docdate, "%d_%m_%Y %H_%M_%S") rescue Date.strptime(docdate, "%d_%m_%Y") rescue nil)
      docdossier = row['DossiersTX']
      docauthor = row['AuteurTX']

      # `line` is a 0-based row index; +2 presumably accounts for the header
      # line and 1-based numbering in the warning message.
      next unless check_duplicate_html rel_path, line + 2
      next unless check_file_exist rel_path

      ie_info = process_ie rel_path, title
      next unless ie_info

      # Create/find directory collection for path
      root = item
      root_dir = parameter(:root_dir)
      ie_info[:path].split('/').each do |dir|
        child = root.items.find_by('properties.name' => dir)
        dir_path = File.join(root_dir, dir)
        unless child
          child = Libis::Ingester::Collection.new
          child.filename = dir_path
          child.parent = root
          child.navigate = parameter(:collection_navigate)
          child.publish = parameter(:collection_publish)
          debug 'Created Collection item `%s`', root, child.name
          child.save!
        end
        root = child
        root_dir = dir_path
      end

      # Add IE object
      ie = Libis::Ingester::IntellectualEntity.new
      ie.name = ie_info[:filename]
      ie.label = ie_info[:title]
      ie.parent = root
      debug 'Created IE for `%s`', root, ie.name
      ie.save!

      # create DC metadata
      dc = Libis::Metadata::DublinCoreRecord.new
      dc.title = ie_info[:title]
      dc.type = doctype unless doctype.blank?
      dc.subject = docdossier unless docdossier.blank?
      # noinspection RubyResolve
      dc.created = docdate if docdate
      # noinspection RubyResolve
      dc.creator = docauthor unless docauthor.blank?

      # Add the metadata to the IE
      metadata = Libis::Ingester::MetadataRecord.new
      metadata.format = 'DC'
      metadata.data = dc.to_xml
      ie.metadata_record = metadata
      ie.save!

      # Add HTML file to the IE
      file = Libis::Ingester::FileItem.new
      file.filename = full_path(File.join(ie_info[:path], ie_info[:filename])).to_s
      ie.add_item(file)
      debug 'Created File for `%s`', ie, file.filename
      file.save!

      # Add linked files and images to the IE
      ie_info[:links].each do |link|
        file = Libis::Ingester::FileItem.new
        file.filename = full_path(File.join(ie_info[:path], link)).to_s
        ie.add_item(file)
        debug 'Created File for `%s`', ie, file.filename
        file.save!
      end

      ie.save!
      ie_count += 1
      item.status_progress self.namepath, ie_count
    end
  ensure
    # Close the CSV even when processing a row raised.
    close_result = csv.close
  end

  # Keep the previous return value: whatever closing the CSV returned.
  close_result
end
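Process a single HTML file to retrieve the information needed to create an IE for it @param [String] rel_path path to the HTML file relative to the :root_dir parameter @param [String] title title for the IE; when nil, the file name without its extension is used @return [Hash] IE information structure: path, filename, title and links (file and image references combined)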
# File lib/libis/ingester/tasks/bb_notes_collector.rb, line 158
#
# Parses a single HTML file and collects what is needed to create an IE for it.
#
# Anchor targets (`a/@href`) and image sources (`img/@src`) are converted with
# #link2path, filtered with #ignore_link / #ignore_file, and dropped (with a
# warning) when the referenced file does not exist next to the HTML file.
#
# @param [String] rel_path path to the HTML file, resolved through #full_path
# @param [String, nil] title title for the IE; when nil the file name without
#   its extension is used
# @return [Hash] with keys :path (directory part of rel_path), :filename,
#   :title (stripped) and :links (Set of file and image references combined)
def process_ie(rel_path, title)
  rel_dir, fname = File.split(rel_path)
  title ||= File.basename(fname, '.*')

  # Block form closes the file handle even if parsing raises.
  # noinspection RubyResolve
  html = File.open(full_path(rel_path), 'r:UTF-8') do |f|
    Nokogiri::HTML(f) { |config| config.strict.nonet.noblanks }
  end

  # File links
  links = html.xpath('//a/@href').map(&:value).map { |link| link2path(link) }.reject { |link| ignore_link(link) }

  # Check if files referenced do exist
  links.reject! do |link|
    next false if full_path(File.join(rel_dir, link)).exist?

    warn 'File \'%s\' referenced in HTML file `%s` was not found. Reference will be ignored.', link, rel_path
    true
  end

  # Image links
  images = html.xpath('//img/@src').map(&:value).map { |link| link2path(link) }.reject { |i| ignore_file(i) }

  # Check if images referenced do exist
  images.reject! do |link|
    next false if full_path(File.join(rel_dir, link)).exist?

    warn 'Image \'%s\' referenced in HTML file `%s` was not found. Reference will be ignored.', link, rel_path
    true
  end

  # Remove duplicate links; a size difference means something was referenced twice.
  link_set = links.to_set + images.to_set
  unless link_set.count == links.count + images.count
    warn 'HTML file `%s` contains duplicate file references. Duplicates are ignored.', rel_path
  end

  # return result
  {
    path: rel_dir,
    filename: fname,
    title: title.strip,
    links: link_set,
  }
end