class Libis::Ingester::MasterthesisCollector
Constants
- KUL_ID
Protected Instance Methods
process(item)
click to toggle source
Processes the input directory on the FTP server, looking for new material. @param [Libis::Ingester::Run] item
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 48
# Scan the configured input directory and hand every new thesis directory to
# #process_dir.
#
# The listing is read through an FTPS service built from the :ftp_* parameters,
# or through a local file service when the :local_storage parameter is set.
# The service is created once and memoized in @file_service.
#
# Entries are skipped when they are plain files, when their name does not match
# the optional :selection_regex parameter, or when their name is already a key
# in the 'Loaded' storage record of the 'Masterthesis' domain (a warning is
# logged for the latter).
#
# @param [Libis::Ingester::Run] item run whose work_dir is remembered in @work_dir
def process(item)
  @work_dir = item.work_dir

  loaded = DomainStorage.where(domain: 'Masterthesis').find_or_create_by(name: 'Loaded').data

  @file_service ||=
    if parameter(:local_storage).blank?
      Libis::Ingester::FtpsService.new(
        parameter(:ftp_host), parameter(:ftp_port), parameter(:ftp_user), parameter(:ftp_password)
      )
    else
      Libis::Ingester::FileService.new(parameter(:local_storage))
    end

  @file_service.ls(parameter(:ftp_subdir)).each do |dir|
    next if @file_service.is_file?(dir)

    name = File.basename(dir)
    pattern = parameter(:selection_regex)
    next if !pattern.nil? && Regexp.new(pattern) !~ name

    previous = loaded[name]
    if previous
      warn 'Thesis found that is already ingested: \'%s\' [%s]', name, previous
    else
      process_dir(dir)
    end
  end
end
Private Instance Methods
add_node(xml, node_name) { || ... }
click to toggle source
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 272
# Set one metadata field on +xml+ with the value produced by the block.
#
# A name ending in '!' is sent to +xml+ unchanged; any other name gets '='
# appended and is sent as a setter. A StandardError raised by the block or by
# the call is rescued and reported as a warning instead of propagating.
#
# @param xml receiver of the call; xml.identifier.text is used in the warning
# @param [String, Symbol] node_name field name, optionally ending in '!'
# @yieldreturn the value passed to the field method
def add_node(xml, node_name)
  value = yield
  # Reassigned on purpose: the warning below reports the name that was actually sent.
  node_name = "#{node_name}=" unless node_name.to_s.end_with?('!')
  xml.send(node_name, value)
rescue StandardError
  warn "Could not create metadata field: #{node_name} for #{xml.identifier.text}"
end
check_error(errors, msg, *args)
click to toggle source
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 169
# Record one validation failure.
#
# The formatted message is appended to +errors+ and the unformatted message
# plus its arguments are passed on to #error.
#
# @param [Array<String>] errors collector that receives the formatted message
# @param [String] msg format string
# @param args values substituted into +msg+
# @return [false] always, so callers can assign the result to their check flag
def check_error(errors, msg, *args)
  formatted = msg % args
  errors.push(formatted)
  error(msg, *args)
  false
end
check_thesis(dir_name, files, xml_doc, xml_file_name)
click to toggle source
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 175
# Validate a thesis directory against its XML description.
#
# Checks that the XML holds exactly one thesis with a title and exactly one
# non-empty main file entry, that every file named in the XML is present in
# +files+, that +files+ holds nothing besides the XML file and the named files,
# and that no file is named more than once. Every failure is recorded through
# #check_error.
#
# @param [String] dir_name directory name, used in the error messages
# @param [Array<String>] files paths of the files found for this thesis
# @param xml_doc parsed XML document (must respond to #root and #[])
# @param [String] xml_file_name base name of the XML file inside +files+
# @return [Array] [proef, files_from_xml] when all checks pass,
#   [false, errors] otherwise
def check_thesis(dir_name, files, xml_doc, xml_file_name)
  check = true
  errors = []

  proeven = xml_doc.root.search('/proeven/proef')
  if proeven.empty?
    check_error errors, 'XML file in %s does not contain a thesis.', dir_name
    # Same shape as every other failure, so a destructuring caller receives the
    # error list rather than nil.
    return [false, errors]
  end
  check = check_error errors, 'XML file in %s contains multiple theses.', dir_name if proeven.size > 1
  proef = proeven.first

  # check if item has title; to_s so that a missing title node is reported
  # as a validation error instead of raising on nil
  if xml_doc['//titel1/tekst'].to_s.strip.blank?
    check = check_error errors, 'XML entry for %s in does not have a value for titel1/text.', dir_name
  end

  # check the main file entry; a missing entry is reported once, not also as "empty"
  hoofdtekst = proef.search('bestanden/hoofdtekst').map(&:text)
  if hoofdtekst.empty?
    check = check_error errors, 'XML file for %s missing a main file entry (bestanden/hoofdtekst).', dir_name
  else
    if hoofdtekst.size > 1
      check = check_error errors, 'XML file for %s has multiple a main file entries (bestanden/hoofdtekst).', dir_name
    end
    if hoofdtekst.first.blank?
      check = check_error errors, 'XML file for %s has an empty main file entry (bestanden/hoofdtekst).', dir_name
    end
  end

  bijlagen = proef.search('bestanden/bijlage').map(&:text)
  files_from_xml = hoofdtekst + bijlagen
  found_names = files.map { |file| File.basename(file) }

  # check if files in FTP dir and XML file match
  files_from_xml.each do |fname|
    next if found_names.include?(fname)

    check = check_error errors, 'The file \'%s\' listed in the XML for %s is not found on FTP server', fname, dir_name
  end

  found_names.each do |fname|
    next if fname == xml_file_name || files_from_xml.include?(fname)

    check = check_error errors, 'The file \'%s\' was found on the FTP in %s that was not listed in the XML', fname, dir_name
  end

  # check if all file entries have a unique name (reported in first-occurrence order)
  repeated = files_from_xml.group_by { |fname| fname }.select { |_, hits| hits.size > 1 }.keys
  repeated.each do |fname|
    check = check_error errors, 'The file \'%s\' is referenced more than once in the XML file for %s', fname, dir_name
  end

  check ? [proef, files_from_xml] : [false, errors]
end
create_metadata(proef, id, instelling_id)
click to toggle source
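noinspection RubyResolve. Creates the Dublin Core metadata record for a thesis. @param [Nokogiri::XML::Node] proef source XML data. @param [String] id identifier. @param instelling_id institution identifier (compared with KUL_ID).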
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 242
# noinspection RubyResolve
# Build the Dublin Core record for one thesis.
#
# Identifier, title, source and date are assigned directly; all other fields go
# through #add_node, so a failure while reading one of those values is logged
# there and does not abort the record. Publisher and rights are only added when
# +instelling_id+ equals KUL_ID.
#
# @param [Nokogiri::XML::Node] proef source xml data
# @param [String] id identifier
# @param instelling_id institution identifier, compared with KUL_ID
# @return [Libis::Metadata::DublinCoreRecord] the populated record
def create_metadata(proef, id, instelling_id)
  kul_master = (instelling_id == KUL_ID)
  pub_date = DateTime.now.year
  degree_prefix = kul_master ? 'Master ' : ''
  # Only invoked inside add_node blocks, so lookups stay under add_node's rescue.
  text_of = ->(node, tag) { node.at(tag).text.strip }

  record = ::Libis::Metadata::DublinCoreRecord.new
  record.identifier = "#{id}"
  record.title = proef.at('titel1').at('tekst').text.strip

  add_node(record, :creator) do
    "#{text_of.call(proef, 'stdnaam')}, #{text_of.call(proef, 'stdvoornaam')} (author)"
  end
  add_node(record, :description) do
    "Dissertation note: Diss #{degree_prefix}(#{text_of.call(proef, 'opleidingnaam')})"
  end
  if kul_master
    add_node(record, :publisher) do
      "#{parameter(:dc_location)}: #{parameter(:dc_institution)}. " \
        "#{text_of.call(proef, 'faculteitnaam')}, #{pub_date}"
    end
  end

  # Promotors first, then co-promotors; both are written as thesis advisors.
  %w[promotoren/promotor copromotoren/copromotor].each do |path|
    proef.xpath(path).each do |advisor|
      add_node(record, 'contributor!') do
        "#{text_of.call(advisor, 'naam')}, #{text_of.call(advisor, 'voornaam')} (thesis advisor)"
      end
    end
  end

  record.source = "#{id}"
  if kul_master
    add_node(record, :rights) do
      "#{parameter(:dc_institution)}. #{text_of.call(proef, 'faculteitnaam')} (degree grantor)"
    end
  end
  record.date = "#{pub_date}"
  record
end
process_dir(dir)
click to toggle source
Processes one FTP directory. @param [String] dir FTP directory to process
# File lib/libis/ingester/tasks/masterthesis_collector.rb, line 76
# Process one FTP directory.
#
# Steps, in order:
# 1. skip the directory when the run already has an item with its name;
# 2. copy every file of the directory into a sub-directory of @work_dir;
# 3. locate 'e_thesis.xml', load it and save it back in place;
# 4. validate the directory with #check_thesis;
# 5. derive the access right name and verify that it exists;
# 6. create the IntellectualEntity with its Dublin Core record, attach the
#    files listed in the XML and finally the XML file itself, then save it.
#
# When the XML file is missing or validation fails, a "<dir_name>.error" file
# with the messages is written to the :ftp_errdir location and processing stops.
# When the access right is unknown, the run item is set to :FAILED.
#
# @param [String] dir FTP directory to process
def process_dir(dir)
  dir_name = File.basename(dir)
  debug 'Processing dir %s', dir_name

  # Check if we already created an IE for this directory
  if workitem.items.find_by(name: dir_name)
    debug 'Skipping dir %s as it is already processed: IE exists in run', dir_name
    return
  end

  # Copy all files to local work dir
  work_dir = File.join(@work_dir, dir_name)
  FileUtils.mkpath(work_dir)
  files = @file_service.ls(dir).map do |file|
    local_file = File.join(work_dir, File.basename(file))
    @file_service.get_file(file, local_file)
    local_file
  end

  # Get the XML file
  xml_file_name = 'e_thesis.xml'
  xml_file = files.find { |file| File.basename(file) == xml_file_name }
  unless xml_file
    error 'XML file missing in %s', dir_name
    @file_service.put_file(File.join(parameter(:ftp_errdir), "#{dir_name}.error"), ['XML file missing in %s' % dir_name])
    return
  end

  # Load and parse the XML file
  xml_doc = Libis::Tools::XmlDocument.open(xml_file)
  xml_doc.save(xml_file) # Fix the bad XML that SAP program provides

  proef, files_from_xml = check_thesis(dir_name, files, xml_doc, xml_file_name)
  unless proef
    # On failure the second element holds the error messages, not file names.
    @file_service.put_file(File.join(parameter(:ftp_errdir), "#{dir_name}.error"), files_from_xml)
    return
  end

  # Check AccessRight
  embargo = xml_doc['//embargo'].to_i
  pub = !(xml_doc.embargo('isPubliek').blank?)
  instelling_id = xml_doc['//instellingId'] || KUL_ID
  ar_extension =
    if embargo != 0
      'PROTECTED'
    elsif pub
      'PUBLIC'
    else
      'IP-RESTRICTED'
    end
  ar_name = "AR_MT_#{instelling_id}_#{ar_extension}"
  unless Libis::Ingester::AccessRight.find_by(name: ar_name)
    error "AccessRight #{ar_name} not found.", workitem
    set_status(workitem, :FAILED)
    return
  end

  # Create IE for thesis
  ie_item = Libis::Ingester::IntellectualEntity.new
  ie_item.name = dir_name
  ie_item.label = xml_doc['//titel1/tekst'].strip
  ie_item.properties['source_path'] = dir
  ie_item.properties['identifier'] = dir_name
  ie_item.properties['access_right'] = ar_name
  ie_item.properties['user_a'] = 'Ingest from SAP'
  ie_item.properties['user_b'] = xml_doc['//voorkeurbib']

  # Build Dublin Core record from the rest of the XML
  ie_item.metadata_record_attributes = {
    format: 'DC',
    data: create_metadata(proef, dir_name, instelling_id).to_xml
  }

  # Save item
  workitem << ie_item
  # Log the label: no 'title' property is set in this method, the title is stored via label=.
  debug 'Added IE %s \'%s\'', dir_name, ie_item.label

  # add files to IE
  files_from_xml.each do |fname|
    file = files.find { |f| File.basename(f) == fname }
    file_item = Libis::Ingester::FileItem.new
    file_item.filename = file
    ie_item << file_item
    debug 'Added file \'%s\'.', ie_item, fname
  end

  # finally add the XML file
  xml_item = Libis::Ingester::FileItem.new
  xml_item.filename = xml_file
  ie_item << xml_item
  debug 'Added XML file.', ie_item

  # Save item
  ie_item.save!
end