Class: DhEasy::Core::Mock::FakeDb

Inherits:
Object
  • Object
show all
Defined in:
lib/dh_easy/core/mock/fake_db.rb

Overview

Fake in memory database that emulates `DataHen` database objects' black box behavior.

Constant Summary collapse

PAGE_KEYS =

Page id keys, analog to primary keys.

['gid'].freeze
OUTPUT_KEYS =

Output id keys, analog to primary keys.

['_id', '_collection'].freeze
JOB_KEYS =

Job id keys, analog to primary keys.

['job_id'].freeze
JOB_STATUSES =

Job available status.

{
  active: 'active',
  done: 'done',
  cancelled: 'cancelled',
  paused: 'paused'
}
DEFAULT_COLLECTION =

Default collection for saved outputs

'default'
DEFAULT_FETCH_TYPE =

Default page's fetch type

'standard'
DEFAULT_UUID_ALGORITHM =

Default uuid algorithm

:md5
VALID_UUID_ALGORITHMS =

Valid uuid algorithms

[:md5, :sha1, :sha256]

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ FakeDb

Initialize fake database.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options.

Options Hash (opts):

  • :job_id (Integer, nil)

    Job id default value.

  • :scraper_name (String, nil)

    Scraper name default value.

  • :page_gid (String, nil)

    Page gid default value.

  • :allow_page_gid_override (Boolean, nil) — default: false

    Specify whether the page gid can be overridden on page or output insert.

  • :allow_job_id_override (Boolean, nil) — default: false

    Specify whether the job id can be overridden on page or output insert.

  • :uuid_algorithm (Enumerator, nil) — default: :md5

    Specify the algorithm to be used to generate UUID values.



380
381
382
383
384
385
386
387
# File 'lib/dh_easy/core/mock/fake_db.rb', line 380

# Initialize fake database.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Integer,nil] :job_id Job id default value.
# @option opts [String,nil] :scraper_name Scraper name default value.
# @option opts [String,nil] :page_gid Page gid default value.
# @option opts [Symbol,nil] :uuid_algorithm Algorithm used to generate UUID values.
# @option opts [Boolean,nil] :allow_page_gid_override (false) Allow page gid override on insert.
# @option opts [Boolean,nil] :allow_job_id_override (false) Allow job id override on insert.
def initialize opts = {}
  # Setter order is kept: the job id is assigned before the scraper name.
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page_gid = opts[:page_gid]
  self.uuid_algorithm = opts[:uuid_algorithm]
  # Missing (nil) and false both leave the override disabled; any other value enables it.
  @allow_page_gid_override = opts[:allow_page_gid_override] ? true : false
  @allow_job_id_override = opts[:allow_job_id_override] ? true : false
end

Class Method Details

.build_fake_job(opts = {}) ⇒ Hash

Build a fake job by using FakeDb engine.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Options Hash (opts):

  • :scraper_name (String) — default: nil

    Scraper name.

  • :job_id (Integer) — default: nil

    Job id.

  • :status (String) — default: 'done'

    Job status.

Returns:

  • (Hash)


243
244
245
246
247
248
249
250
# File 'lib/dh_easy/core/mock/fake_db.rb', line 243

# Build a fake job by using FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options (also forwarded to `build_job`).
# @option opts [String] :scraper_name (nil) Scraper name.
# @option opts [Integer] :job_id (nil) Job id.
# @option opts [String] :status ('done') Job status.
#
# @return [Hash]
def self.build_fake_job opts = {}
  status = opts[:status] || 'done'
  build_job(
    {
      'job_id' => opts[:job_id],
      'scraper_name' => opts[:scraper_name],
      'status' => status
    },
    opts
  )
end

.build_fake_page(opts = {}) ⇒ Hash

Build a fake page by using FakeDb engine.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Options Hash (opts):

  • :url (String) — default: 'https://example.com'

    Page url.

Returns:

  • (Hash)


90
91
92
93
94
95
# File 'lib/dh_easy/core/mock/fake_db.rb', line 90

# Build a fake page by using FakeDb engine.
#
# @param [Hash] opts ({}) Configuration options (also forwarded to `build_page`).
# @option opts [String] :url ('https://example.com') Page url.
#
# @return [Hash]
def self.build_fake_page opts = {}
  url = opts[:url] || 'https://example.com'
  build_page({ 'url' => url }, opts)
end

.build_job(job, opts = {}) ⇒ Hash

Build a job with defaults by using FakeDb engine.

Parameters:

  • job (Hash)

    Job initial values.

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Returns:

  • (Hash)


229
230
231
232
233
# File 'lib/dh_easy/core/mock/fake_db.rb', line 229

# Build a job with defaults by inserting it into a throwaway FakeDb
# and returning the last stored job.
#
# @param [Hash] job Job initial values.
# @param [Hash] opts ({}) Configuration options passed to `FakeDb.new`.
#
# @return [Hash]
def self.build_job job, opts = {}
  scratch_db = DhEasy::Core::Mock::FakeDb.new(opts)
  stored_jobs = scratch_db.jobs
  stored_jobs << job
  scratch_db.jobs.last
end

.build_page(page, opts = {}) ⇒ Hash

Build a page with defaults by using FakeDb engine.

Parameters:

  • page (Hash)

    Page initial values.

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Returns:

  • (Hash)


74
75
76
77
78
79
80
81
82
# File 'lib/dh_easy/core/mock/fake_db.rb', line 74

# Build a page with defaults by inserting it into a throwaway FakeDb
# and returning the first stored page.
#
# @param [Hash] page Page initial values.
# @param [Hash] opts ({}) Configuration options passed to `FakeDb.new`.
#   Page gid and job id overrides are enabled unless `opts` says otherwise.
#
# @return [Hash]
def self.build_page page, opts = {}
  override_defaults = { allow_page_gid_override: true, allow_job_id_override: true }
  # Caller supplied options win over the override defaults.
  scratch_db = DhEasy::Core::Mock::FakeDb.new(override_defaults.merge(opts))
  stored_pages = scratch_db.pages
  stored_pages << page
  scratch_db.pages.first
end

.clean_uri(raw_url) ⇒ String

Clean a URL to remove fragment, lowercase scheme and host, and sort

query string.

Parameters:

  • raw_url (String)

    URL to clean.

Returns:

  • (String)


129
130
131
# File 'lib/dh_easy/core/mock/fake_db.rb', line 129

# Clean a URL through `clean_uri_obj` and return it as a string.
#
# @param [String] raw_url URL to clean.
#
# @return [String]
def self.clean_uri raw_url
  cleaned = clean_uri_obj(raw_url)
  cleaned.to_s
end

.clean_uri_obj(raw_url) ⇒ URI::HTTPS

Clean a URL to remove fragment, lowercase scheme and host, and sort

query string.

Parameters:

  • raw_url (String)

    URL to clean.

Returns:

  • (URI::HTTPS)


103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/dh_easy/core/mock/fake_db.rb', line 103

def self.clean_uri_obj raw_url
  url = URI.parse(raw_url)
  url.hostname = url.hostname.downcase
  url.fragment = nil

  # Sort query string keys
  unless url.query.nil?
    query_string = CGI.parse(url.query)
    keys = query_string.keys.sort
    data = []
    keys.each do |key|
      query_string[key].each do |value|
        data << "#{URI.encode key}=#{URI.encode value}"
      end
    end
    url.query = data.join('&')
  end
  url
end

.fake_uuid(seed = nil, algorithm = nil) ⇒ String

Generate a fake UUID.

Parameters:

  • seed (nil) (defaults to: nil)

    Object to use as seed for uuid.

  • algorithm (Enumerator) (defaults to: nil)

    (nil) Algorithm to use: md5 (default), sha1, sha256.

Returns:

  • (String)


44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/dh_easy/core/mock/fake_db.rb', line 44

# Generate a fake UUID as the hex digest of the seed's string form.
#
# @param [Object,nil] seed (nil) Object to use as seed; when missing, the
#   current time plus a random number is used.
# @param [Symbol,nil] algorithm (nil) `:sha256`, `:sha1` or `:md5`; falls back
#   to `DEFAULT_UUID_ALGORITHM` when missing. Any other value uses MD5.
#
# @return [String]
def self.fake_uuid seed = nil, algorithm = nil
  seed = (Time.new.to_f + rand) unless seed
  algorithm = DEFAULT_UUID_ALGORITHM unless algorithm
  digest_class = case algorithm
                 when :sha256 then Digest::SHA256
                 when :sha1 then Digest::SHA1
                 else Digest::MD5
                 end
  digest_class.hexdigest(seed.to_s)
end

.new_collection(keys, opts = {}) ⇒ DhEasy::Core::SmartCollection

Generate a smart collection with keys and initial values.

Parameters:

  • keys (Array)

    Analog to primary keys, combination will be uniq.

  • opts (Hash) (defaults to: {})

    Configuration options (see DhEasy::Core::SmartCollection#initialize).

Returns:



34
35
36
# File 'lib/dh_easy/core/mock/fake_db.rb', line 34

# Generate a smart collection with keys and initial values.
#
# @param [Array] keys Analog to primary keys, combination will be uniq.
# @param [Hash] opts ({}) Configuration options, passed straight to
#   `DhEasy::Core::SmartCollection.new`.
#
# @return [DhEasy::Core::SmartCollection]
def self.new_collection keys, opts = {}
  DhEasy::Core::SmartCollection.new(keys, opts)
end

.output_uuid(data, uuid_algorithm = nil) ⇒ String

Generate a fake UUID based on output fields without `_` prefix.

Parameters:

  • data (Hash)

    Output data.

  • uuid_algorithm (Enumerator) (defaults to: nil)

    (nil) Algorithm to use: md5 (default), sha1, sha256.

Returns:

  • (String)


63
64
65
66
# File 'lib/dh_easy/core/mock/fake_db.rb', line 63

# Generate a fake UUID based on output fields without `_` prefix.
#
# The seed is the `#hash` of the filtered data.
# NOTE(review): Ruby does not guarantee `#hash` values to be identical across
# interpreter runs, so the result is presumably only stable within one process.
#
# @param [Hash] data Output data.
# @param [Symbol,nil] uuid_algorithm (nil) Algorithm forwarded to `fake_uuid`.
#
# @return [String]
def self.output_uuid data, uuid_algorithm = nil
  public_fields = data.select { |key, _value| key.to_s =~ /^[^_]/ }
  fake_uuid(public_fields.hash, uuid_algorithm)
end

.time_stamp(time = nil) ⇒ String

Return a timestamp

Parameters:

  • time (Time) (defaults to: nil)

    (nil) Time from which to get time stamp.

Returns:

  • (String)


257
258
259
260
# File 'lib/dh_easy/core/mock/fake_db.rb', line 257

# Return a UTC timestamp formatted as `YYYY-MM-DDTHH:MM:SS[.fraction]Z`.
#
# The fraction has microsecond precision with trailing zeros removed; when the
# fraction is zero it is omitted together with the dot.
#
# @param [Time,nil] time (nil) Time from which to get the time stamp; the
#   current time is used when missing. The given object is not modified.
#
# @return [String]
def self.time_stamp time = nil
  time = Time.new if time.nil?
  # `getutc` returns a new Time, whereas `utc` would convert the caller's object in place.
  stamp = time.getutc.strftime('%FT%T.%6N')
  # Trim only the fractional part so whole seconds such as "10" or "00" stay intact.
  "#{stamp.sub(/\.?0+\z/, '')}Z"
end

Instance Method Details

#allow_job_id_override?Boolean

Specify whenever job id overriding by user is allowed on page or

output insert.

Returns:

  • (Boolean)

    `true` when allowed, else `false`.



364
365
366
# File 'lib/dh_easy/core/mock/fake_db.rb', line 364

# Tell whether job id overriding by user is allowed on page or output insert.
#
# @return [Boolean] `true` when allowed, else `false`.
def allow_job_id_override?
  # Normalize an unset (or falsy) flag to `false` before returning it.
  @allow_job_id_override = false unless @allow_job_id_override
  @allow_job_id_override
end

#allow_page_gid_override?Boolean

Specify whenever page gid overriding by user is allowed on page or

output insert.

Returns:

  • (Boolean)

    `true` when allowed, else `false`.



346
347
348
# File 'lib/dh_easy/core/mock/fake_db.rb', line 346

# Tell whether page gid overriding by user is allowed on page or output insert.
#
# @return [Boolean] `true` when allowed, else `false`.
def allow_page_gid_override?
  # Normalize an unset (or falsy) flag to `false` before returning it.
  @allow_page_gid_override = false unless @allow_page_gid_override
  @allow_page_gid_override
end

#disable_job_id_overrideObject

Disable job id override on page or output insert.



356
357
358
# File 'lib/dh_easy/core/mock/fake_db.rb', line 356

# Disable job id override on page or output insert.
# Sets the flag read by `allow_job_id_override?` to `false`.
def disable_job_id_override
  @allow_job_id_override = false
end

#disable_page_gid_overrideObject

Disable page gid override on page or output insert.



338
339
340
# File 'lib/dh_easy/core/mock/fake_db.rb', line 338

# Disable page gid override on page or output insert.
# Sets the flag read by `allow_page_gid_override?` to `false`.
def disable_page_gid_override
  @allow_page_gid_override = false
end

#enable_job_id_overrideObject

Enable job id override on page or output insert.



351
352
353
# File 'lib/dh_easy/core/mock/fake_db.rb', line 351

# Enable job id override on page or output insert.
# Sets the flag read by `allow_job_id_override?` to `true`.
def enable_job_id_override
  @allow_job_id_override = true
end

#enable_page_gid_overrideObject

Enable page gid override on page or output insert.



333
334
335
# File 'lib/dh_easy/core/mock/fake_db.rb', line 333

# Enable page gid override on page or output insert.
# Sets the flag read by `allow_page_gid_override?` to `true`.
def enable_page_gid_override
  @allow_page_gid_override = true
end

#ensure_job(target_job_id = nil) ⇒ Hash

Get current job or create new one from values.

Parameters:

  • target_job_id (Integer) (defaults to: nil)

    (nil) Job id to ensure existence.

Returns:

  • (Hash)


267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/dh_easy/core/mock/fake_db.rb', line 267

# Get the job with the given id, creating and storing it when missing.
#
# A newly created job gets the current scraper name; it is marked as
# `'active'` only when its id is the current job id.
#
# @param [Integer,nil] target_job_id (nil) Job id to look for; the current
#   job id is used when missing.
#
# @return [Hash]
def ensure_job target_job_id = nil
  target_job_id = job_id if target_job_id.nil?
  existing = jobs.find { |candidate| candidate['job_id'] == target_job_id }
  return existing unless existing.nil?

  new_job = { 'job_id' => target_job_id, 'scraper_name' => scraper_name }
  new_job['status'] = 'active' if target_job_id == job_id
  jobs << new_job
  jobs.last
end

#fake_uuid(seed = nil) ⇒ String

Generate a fake UUID using the configured uuid algorithm.

Parameters:

  • seed (nil) (defaults to: nil)

    Object to use as seed for uuid.

Returns:

  • (String)


394
395
396
# File 'lib/dh_easy/core/mock/fake_db.rb', line 394

# Generate a fake UUID using the configured uuid algorithm.
# Delegates to the class level `fake_uuid`.
#
# @param [Object,nil] seed (nil) Object to use as seed for uuid.
#
# @return [String]
def fake_uuid seed = nil
  algorithm = self.uuid_algorithm
  self.class.fake_uuid(seed, algorithm)
end

#generate_job_idInteger

Generate a fake job_id.

Returns:

  • (Integer)


408
409
410
# File 'lib/dh_easy/core/mock/fake_db.rb', line 408

# Generate a fake job_id: `1` when no job is stored, otherwise one more than
# the highest stored job id.
#
# @return [Integer]
def generate_job_id
  return 1 if jobs.count < 1

  highest_id = jobs.map { |job| job['job_id'] }.max
  highest_id + 1
end

#generate_output_id(data) ⇒ String

Generate a fake UUID for outputs.

Parameters:

  • data (Hash)

    Output data.

Returns:

  • (String)


600
601
602
603
# File 'lib/dh_easy/core/mock/fake_db.rb', line 600

# Generate a fake UUID for outputs.
#
# @param [Hash] data Output data (not used to build the id).
#
# @return [String]
def generate_output_id data
  # No seed is given, so the id is random; this matches Datahen behavior.
  fake_uuid
end

#generate_page_gid(page_data) ⇒ String

Generate a fake UUID based on page data:

* url
* method
* headers
* fetch_type
* cookie
* no_redirect
* body
* ua_type

Parameters:

  • page_data (Hash)

    Page data.

Returns:

  • (String)


455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
# File 'lib/dh_easy/core/mock/fake_db.rb', line 455

# Generate a fake page gid in the form `<hostname>-<checksum>`.
#
# The checksum is a fake UUID seeded with a `|` separated list of
# `name:value` entries built from the page data: method, url, headers, body,
# no_redirect and ua_type are always included; fetch_type, cookie, http2,
# driver name, display size and screenshot are appended only when set.
#
# @param [Hash] page_data Page data, read through string keys.
#
# @return [String] Page gid, or an empty string when the page has no url.
def generate_page_gid page_data
  # ensure page url
  return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''

  # calculate extra fields, keep field order to match datahen
  data = []
  data << "method:#{page_data['method'].to_s.downcase}"
  no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
  # The cleaned uri is always built because its hostname prefixes the gid.
  uri = self.class.clean_uri_obj(page_data['url'])
  # With `no_url_encode` the raw url (minus leading whitespace) is hashed as is.
  url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
  data << "url:#{url}"
  headers = self.class.format_headers page_data['headers']
  data << "headers:#{headers}"
  data << "body:#{page_data['body'].to_s}"
  no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
  data << "no_redirect:#{no_redirect.to_s}"
  # A missing or empty ua_type counts as 'desktop'.
  ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
  data << "ua_type:#{ua_type}"

  # complex fields
  data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
  # keep this cookie logic to match datahen
  # (cookie entries are split on ';', sorted and joined back without spaces)
  data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
  data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
  data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
  unless self.class.is_display_empty? page_data['display']
    data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
  end
  unless self.class.is_screenshot_empty? page_data['screenshot']
    # The screenshot settings are reduced to a checksum of their JSON form.
    checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
    data << "screenshot:#{checksum}"
  end

  # generate GID
  seed = data.join('|')
  checksum = self.fake_uuid seed
  "#{uri.hostname}-#{checksum}"
end

#generate_scraper_nameString

Generate a fake scraper name.

Returns:

  • (String)


401
402
403
# File 'lib/dh_easy/core/mock/fake_db.rb', line 401

# Generate a fake scraper name using Faker's unique internet slug generator.
#
# @return [String]
def generate_scraper_name
  Faker::Internet.unique.slug
end

#job_idInteger?

Fake job id.

Returns:

  • (Integer, nil)


295
296
297
# File 'lib/dh_easy/core/mock/fake_db.rb', line 295

# Fake job id. Generated on first access when no value has been set.
#
# @return [Integer,nil]
def job_id
  @job_id = generate_job_id unless @job_id
  @job_id
end

#job_id=(value) ⇒ Object

Set fake job id value.



300
301
302
303
304
# File 'lib/dh_easy/core/mock/fake_db.rb', line 300

# Set fake job id value and make sure a job with the resulting id exists.
#
# @param [Integer,nil] value New job id.
def job_id= value
  @job_id = value
  # Called without an id, so `ensure_job` falls back to the current `job_id`.
  ensure_job
  job_id
end

#jobsDhEasy::Core::SmartCollection

Stored job collection



428
429
430
431
432
433
434
435
436
437
438
439
440
# File 'lib/dh_easy/core/mock/fake_db.rb', line 428

# Stored job collection, built on first access and reused afterwards.
#
# Items have their keys stringified before defaults are applied, and receive
# a generated `job_id` on insert when they don't carry one.
#
# @return [DhEasy::Core::SmartCollection]
def jobs
  return @jobs unless @jobs.nil?

  job_store = self.class.new_collection(JOB_KEYS, defaults: job_defaults)
  job_store.bind_event(:before_defaults) do |_store, raw_item|
    DhEasy::Core.deep_stringify_keys(raw_item)
  end
  job_store.bind_event(:before_insert) do |_store, item, _match|
    item['job_id'] ||= generate_job_id
    item
  end
  @jobs ||= job_store
end

#outputsDhEasy::Core::SmartCollection

Stored output collection



621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
# File 'lib/dh_easy/core/mock/fake_db.rb', line 621

# Stored output collection, built on first access and reused afterwards.
#
# Before defaults are applied, item keys are stringified and user supplied
# job id / page gid values are dropped unless the matching override is
# enabled. Items get a generated `_id` on insert when missing, and the job
# referenced by `_job_id` is ensured to exist after insert.
#
# @return [DhEasy::Core::SmartCollection]
def outputs
  return @outputs unless @outputs.nil?
  collection = self.class.new_collection OUTPUT_KEYS,
    defaults: output_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    item.delete '_job_id' unless allow_job_id_override?
    unless allow_page_gid_override?
      # NOTE(review): outputs presumably carry their page gid under `_gid`
      # (DataHen naming) - confirm against `output_defaults`. The previously
      # used `_gid_id` key is still removed for backward compatibility.
      item.delete '_gid'
      item.delete '_gid_id'
    end
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['_id'] ||= generate_output_id item
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['_job_id']
  end
  @outputs ||= collection
end

#page_gidInteger?

Current fake page gid.

Returns:

  • (String, nil)


308
309
310
# File 'lib/dh_easy/core/mock/fake_db.rb', line 308

# Current fake page gid. Generated as a fake UUID on first access when no
# value has been set.
#
# @return [String,nil]
def page_gid
  @page_gid = fake_uuid unless @page_gid
  @page_gid
end

#page_gid=(value) ⇒ Object

Set current fake page gid value.



313
314
315
# File 'lib/dh_easy/core/mock/fake_db.rb', line 313

# Set current fake page gid value.
#
# @param [String,nil] value New page gid; `nil` makes `page_gid` generate one
#   on its next call.
def page_gid= value
  @page_gid = value
end

#pagesDhEasy::Core::SmartCollection

Note:

Page gid will be replaced on insert by an auto generated uuid unless page gid overriding is enabled (see #allow_page_gid_override?)

Stored page collection.



550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
# File 'lib/dh_easy/core/mock/fake_db.rb', line 550

# Stored page collection, built on first access and reused afterwards.
#
# Before defaults are applied, item keys are stringified, `driver`, `display`
# and `screenshot` hashes are merged over their defaults, and a user supplied
# `job_id` is dropped unless job id override is enabled. On insert, empty
# complex fields are set to nil, the hostname is derived from the url, the gid
# is generated (always, unless page gid override is enabled and a gid was
# given) and missing timestamps are filled in. After insert, the job
# referenced by the page is ensured to exist.
#
# @return [DhEasy::Core::SmartCollection]
def pages
  # Return the memoized collection instead of building a throwaway one on every call.
  return @pages unless @pages.nil?

  defaults = self.page_defaults
  collection = self.class.new_collection PAGE_KEYS,
    defaults: defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    # Partial nested hashes only override the keys they provide.
    %w[driver display screenshot].each do |key|
      item[key] = defaults[key].merge item[key] if item[key].is_a?(Hash)
    end
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['driver'] = nil if self.class.is_driver_empty? item['driver']
    item['display'] = nil if self.class.is_display_empty? item['display']
    item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
    item['headers'] = nil if self.class.is_hash_empty? item['headers']
    item['vars'] = nil if self.class.is_hash_empty? item['vars']
    item['hostname'] = self.class.clean_uri_obj(item['url']).hostname
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end

    # 30 days = 60 * 60 * 24 * 30 = 2592000
    item['freshness'] ||= self.class.time_stamp(Time.now - 2592000)
    item['to_fetch'] ||= self.class.time_stamp
    item['created_at'] ||= self.class.time_stamp
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['job_id']
  end
  @pages ||= collection
end

#query(collection, filter, offset = 0, limit = nil) ⇒ Object

Note:

Warning: It uses table scan to filter and should be used on test suites only.

Search items from a collection.

Parameters:

  • collection (Symbol)

    Allowed values: `:outputs`, `:pages`, `:jobs`.

  • filter (Hash)

    Filters to query.

  • offset (Integer) (defaults to: 0)

    (0) Search results offset.

  • limit (Integer, nil) (defaults to: nil)

    (nil) Limit search results count. Set to `nil` for unlimited.

Raises:

  • ArgumentError On unknown collection.



668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
# File 'lib/dh_easy/core/mock/fake_db.rb', line 668

# Search items from a collection by scanning every item.
#
# @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
# @param [Hash] filter Filters to query, checked through `match?`.
# @param [Integer] offset (0) Number of matching items to skip.
# @param [Integer,nil] limit (nil) Max number of items returned; `nil` for
#   unlimited. A non positive limit returns an empty list.
#
# @return [Array] Matching items.
#
# @raise [ArgumentError] On unknown collection.
def query collection, filter, offset = 0, limit = nil
  return [] if !limit.nil? && !(limit > 0)

  # Pick the collection to scan
  items =
    if :outputs == collection
      outputs
    elsif :pages == collection
      pages
    elsif :jobs == collection
      jobs
    else
      raise ArgumentError.new "Unknown collection #{collection}."
    end

  # Scan, counting every match so the offset applies to matches only
  matched = 0
  items.each_with_object([]) do |item, found|
    next unless match? item, filter
    matched += 1

    next if matched <= offset
    break found if !limit.nil? && found.count >= limit
    found << item
  end
end

#refetch(job_id, gid) ⇒ Object

Refetch a page.

Parameters:

  • job_id (Integer)

    Page's job_id to refetch.

  • gid (String)

    Page's gid to refetch.

Raises:



703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
# File 'lib/dh_easy/core/mock/fake_db.rb', line 703

# Refetch a page: put it back into `to_fetch` status and reset its fetch,
# parsing, response and failure related fields.
#
# @param [Integer] job_id Page's job_id to refetch.
# @param [String] gid Page's gid to refetch.
#
# @return [nil]
#
# @raise [Exception] When no page matches the given job_id and gid.
def refetch job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  if page.nil?
    raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"")
  end

  # Field order is kept so new keys are added to the page in the same order.
  resets = {
    'status' => 'to_fetch',
    'freshness' => self.class.time_stamp,
    'to_fetch' => self.class.time_stamp,
    'fetched_from' => nil,
    'fetching_at' => '2001-01-01T00:00:00Z',
    'fetched_at' => nil,
    'fetching_try_count' => 0,
    'effective_url' => nil,
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'response_checksum' => nil,
    'response_status' => nil,
    'response_status_code' => nil,
    'response_headers' => nil,
    'response_cookie' => nil,
    'response_proto' => nil,
    'content_type' => nil,
    'content_size' => 0,
    'failed_response_status_code' => nil,
    'failed_response_headers' => nil,
    'failed_response_cookie' => nil,
    'failed_effective_url' => nil,
    'failed_at' => nil,
    'failed_content_type' => nil
  }
  resets.each { |field, value| page[field] = value }
  nil
end

#reparse(job_id, gid) ⇒ Object

Reparse a page.

Parameters:

  • job_id (Integer)

    Page's job_id to reparse.

  • gid (String)

    Page's gid to reparse.

Raises:



740
741
742
743
744
745
746
747
748
749
750
# File 'lib/dh_easy/core/mock/fake_db.rb', line 740

# Reparse a page: put it back into `to_parse` status and reset its parsing
# related fields.
#
# @param [Integer] job_id Page's job_id to reparse.
# @param [String] gid Page's gid to reparse.
#
# @return [Integer] `0`, the value of the last field reset.
#
# @raise [Exception] When no page matches the given job_id and gid.
def reparse job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  if page.nil?
    raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"")
  end

  # Field order is kept so new keys are added to the page in the same order.
  resets = {
    'status' => 'to_parse',
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0
  }
  resets.each { |field, value| page[field] = value }
  0
end

#scraper_nameString?

Fake scraper_name.

Returns:

  • (String, nil)


282
283
284
# File 'lib/dh_easy/core/mock/fake_db.rb', line 282

# Fake scraper_name. Falls back to `'my_scraper'` when no value has been set.
#
# @return [String,nil]
def scraper_name
  @scraper_name = 'my_scraper' unless @scraper_name
  @scraper_name
end

#scraper_name=(value) ⇒ Object

Set fake scraper_name value.



287
288
289
290
291
# File 'lib/dh_easy/core/mock/fake_db.rb', line 287

# Set fake scraper_name value and propagate it to the current job.
#
# @param [String,nil] value New scraper name; with `nil` the job receives the
#   value `scraper_name` falls back to.
def scraper_name= value
  # The current job is resolved before the name changes.
  current_job = ensure_job
  @scraper_name = value
  current_job['scraper_name'] = scraper_name
end

#uuid_algorithmEnumerator?

Current UUID algorithm.

Returns:

  • (Enumerator, nil)


319
320
321
# File 'lib/dh_easy/core/mock/fake_db.rb', line 319

# Current UUID algorithm. Falls back to `DEFAULT_UUID_ALGORITHM` when no
# value has been set.
#
# @return [Symbol,nil]
def uuid_algorithm
  @uuid_algorithm = DEFAULT_UUID_ALGORITHM unless @uuid_algorithm
  @uuid_algorithm
end

#uuid_algorithm=(value) ⇒ Object

Set current UUID algorithm value.

Raises:

  • (ArgumentError)

    Whenever an invalid algorithm is provided



325
326
327
328
329
330
# File 'lib/dh_easy/core/mock/fake_db.rb', line 325

# Set current UUID algorithm value.
#
# @param [Symbol,nil] value One of `VALID_UUID_ALGORITHMS`, or `nil` to clear
#   the current value.
#
# @raise [ArgumentError] Whenever an invalid algorithm is provided.
def uuid_algorithm= value
  accepted = value.nil? || VALID_UUID_ALGORITHMS.include?(value)
  raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256") unless accepted

  @uuid_algorithm = value
end