Price Tracker Ruleset

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import {ancestors} from 'fathom-web/utilsForFrontend';
import {euclidean} from 'fathom-web/clusters';

// Slack, in pixels, above the top edge of the highest-scoring image: a price
// candidate whose top edge is no more than this far above the image's top
// still counts as vertically near the image (see
// RulesetFactory.isNearbyImageYAxisPrice).
const TOP_BUFFER = 150;
// From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js

/**
 * Creates Fathom ruleset instances, and holds individual rule methods for
 * easier testing.
 *
 * Every method that takes an `fnode` only relies on `fnode.element` (the DOM
 * element under consideration) and, for the image-relative rules, on
 * `fnode._ruleset` to look up the highest-scoring image. Scoring methods
 * return either 0/1 or a value produced by linearScale().
 */
export default class RulesetFactory {
  /** Scores fnode in direct proportion to its size */
  isBig(fnode) {
    const domRect = fnode.element.getBoundingClientRect();
    const area = domRect.width * domRect.height;

    // Assume no product images as small as 80px^2. No further bonus over
    // 1000^2. For one thing, that's getting into background image territory
    // (though we should have distinct penalties for that sort of thing if we
    // care). More importantly, clamp the upper bound of the score so we don't
    // overcome other bonuses and penalties.
    return linearScale(area, 80 ** 2, 1000 ** 2);
  }

  /**
   * Scores fnode by its computed font size: 0 at 14 or below, rising linearly
   * to 1 at 50 or above.
   */
  fontIsBig(fnode) {
    const size = parseInt(getComputedStyle(fnode.element).fontSize, 10);
    return linearScale(size, 14, 50);
  }

  /** Scores fnode with a '$' in its innerText */
  hasDollarSign(fnode) {
    return (fnode.element.innerText.includes('$') ? 1 : 0);
  }

  /**
   * Return whether some substring is within a given string, case
   * insensitively.
   *
   * Both arguments are lower-cased before comparing, so a needle containing
   * capitals still matches.
   */
  caselessIncludes(haystack, needle) {
    return haystack.toLowerCase().includes(needle.toLowerCase());
  }

  /**
   * Return 1 if a substring is within a given string, case insensitively,
   * and 0 otherwise.
   */
  stringIncludes(haystack, needle) {
    return (this.caselessIncludes(haystack, needle) ? 1 : 0);
  }

  /**
   * Punish elements with "background" in their ID. Do nothing to those without.
   *
   * Returns 0 when the ID contains "background" and 1 otherwise.
   */
  hasBackgroundInID(fnode) {
    return this.caselessIncludes(fnode.element.id, 'background') ? 0 : 1;
  }

  /** Scores fnode with 'price' in its id */
  hasPriceInID(fnode) {
    return this.stringIncludes(fnode.element.id, 'price');
  }

  /** Scores fnode with 'price' in its parent's id */
  hasPriceInParentID(fnode) {
    return this.stringIncludes(fnode.element.parentElement.id, 'price');
  }

  /** Scores fnode with 'price' in its class name */
  hasPriceInClassName(fnode) {
    return this.stringIncludes(fnode.element.className, 'price');
  }

  /** Scores fnode with 'price' in its parent's class name */
  hasPriceInParentClassName(fnode) {
    return this.stringIncludes(fnode.element.parentElement.className, 'price');
  }

  /** Scores fnode by its vertical location relative to the fold */
  isAboveTheFold(fnode) {
    const viewportHeight = 950;
    const elementTop = fnode.element.getBoundingClientRect().top;

    // Stop giving additional bonus for anything closer than 200px to the top
    // of the viewport. Those are probably usually headers.
    return linearScale(elementTop, viewportHeight * 2, 200);
  }

  /**
   * Return whether the centerpoint of the element is near that of the highest-
   * scoring image.
   *
   * The score falls linearly from 1 at distance 0 to 0 at distance 1000 or
   * more. Scores 0 when the ruleset produced no image at all.
   */
  isNearImage(fnode) {
    const imageFnode = this.getHighestScoringImage(fnode);
    if (!imageFnode) {
      // No image to be near; without this guard the rule would throw.
      return 0;
    }
    return linearScale(euclidean(fnode, imageFnode), 1000, 0);
  }

  /**
   * Return whether the potential title is near the top or bottom of the
   * highest-scoring image.
   *
   * This is a makeshift ORing of 2 signals: a "near the top" and a "near the
   * bottom" one. Scores 0 when the ruleset produced no image at all.
   */
  isNearImageTopOrBottom(fnode) {
    const imageFnode = this.getHighestScoringImage(fnode);
    if (!imageFnode) {
      // No image to measure against; without this guard the rule would throw.
      return 0;
    }
    const imageRect = imageFnode.element.getBoundingClientRect();
    const nodeRect = fnode.element.getBoundingClientRect();

    // Should cover title above image and title in a column next to image.
    // Could also consider using the y-axis midpoint of title.
    const topDistance = Math.abs(imageRect.top - nodeRect.top);

    // Test nodeRect.top. They're probably not side by side with the title at
    // the bottom. Rather, title will be below image.
    const bottomDistance = Math.abs(imageRect.bottom - nodeRect.top);

    const shortestDistance = Math.min(topDistance, bottomDistance);
    return linearScale(shortestDistance, 200, 0);
  }

  /**
   * Return whether the fnode's innertext contains a dollars-and-cents number.
   */
  hasPriceishPattern(fnode) {
    const text = fnode.element.innerText;
    /**
     * With an optional '$' that doesn't necessarily have to be at the beginning
     * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before
     * a decimal point and exactly two after.
     */
    const regExp = /\$?\d+\.\d{2}(?![0-9])/;
    return (regExp.test(text) ? 1 : 0);
  }

  /** Checks to see if a 'price' fnode is eligible for scoring */
  isEligiblePrice(fnode) {
    return (
      this.isVisible(fnode)
      && this.hasDifferentInnerTextThanChildren(fnode)
      && this.isNearbyImageYAxisPrice(fnode)
    );
  }

  /** Checks to see if a 'title' fnode is eligible for scoring */
  isEligibleTitle(fnode) {
    return (
      this.isVisible(fnode)
      // Don't use hasDifferentInnerTextThanChildren, because <h1> tags
      // for Amazon and Walmart have <span> and <div> element children,
      // respectively, with the same innerText.
      //
      // Don't use isNearbyImageYAxisTitle here, as unlike for price, there
      // is a strong correlation for vertical proximity to image, so we want
      // to score it proportionally rather than have a hard cut-off.
    );
  }

  /**
   * Checks that none of the fnode's direct children has the same innerText as
   * the fnode itself. Returns true for an element with no children.
   */
  hasDifferentInnerTextThanChildren(fnode) {
    const element = fnode.element;
    for (const child of element.children) {
      if (child.innerText === element.innerText) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks if fnode is nearby the top scoring image element in the y-axis
   * Unlike for 'title', 'price' elements had worse accuracy when scored
   * in proportion to y-axis proximity to the image.
   *
   * "Nearby" means the element's top is no more than TOP_BUFFER above the
   * image's top and its bottom is not below the image's bottom. Returns false
   * when the ruleset produced no image at all.
   */
  isNearbyImageYAxisPrice(fnode) {
    const imageFnode = this.getHighestScoringImage(fnode);
    if (!imageFnode) {
      // Nothing to be near; without this guard the check would throw.
      return false;
    }
    const rect = fnode.element.getBoundingClientRect();
    const imageRect = imageFnode.element.getBoundingClientRect();
    return (
      rect.top >= (imageRect.top - TOP_BUFFER)
      && rect.bottom <= imageRect.bottom
    );
  }

  /**
   * Return false if any element yielded by ancestors() for this fnode has a
   * computed style that hides it; true otherwise.
   */
  isVisible(fnode) {
    for (const ancestor of ancestors(fnode.element)) {
      const style = getComputedStyle(ancestor);
      // NOTE(review): computed width/height are normally serialized with a
      // unit (e.g. '0px'), so the two comparisons against '0' below may never
      // match. Left as is because changing them alters which candidates the
      // tuned coefficients see -- confirm before fixing.
      const isElementHidden = (
        style.visibility === 'hidden'
        || style.display === 'none'
        || style.opacity === '0'
        || style.width === '0'
        || style.height === '0'
      );
      if (isElementHidden) {
        return false;
      }
    }
    return true;
  }

  /**
   * Return whether the element's computed background-image is a single
   * quoted url("...") value. Always returns a boolean.
   */
  hasBackgroundImage(fnode) {
    const bgImage = getComputedStyle(fnode.element)['background-image'];
    // The app doesn't yet know how to deal with non-url() types of background
    // images.
    return Boolean(bgImage && bgImage.startsWith('url("') && bgImage.endsWith('")'));
  }

  /**
   * Return the aspect ratio of an element's client rect, with horizontal and
   * vertical rearranged so it's always >=1. A rect with zero width or height
   * yields Infinity.
   */
  aspectRatio(element) {
    const rect = element.getBoundingClientRect();
    if (rect.width === 0 || rect.height === 0) {
      return Infinity;
    }
    return (rect.width > rect.height) ? (rect.width / rect.height) : (rect.height / rect.width);
  }

  /**
   * Give a bonus for elements that have a non-extreme aspect ratio: 1 for a
   * ratio of 5 or less, falling linearly to 0 at a ratio of 10 or more.
   */
  hasSquareAspectRatio(fnode) {
    return linearScale(this.aspectRatio(fnode.element), 10, 5);
  }

  /**
   * Build the ruleset used to score elements in an HTML document.
   *
   * `coeffs` and `biases` are forwarded unchanged to Fathom's ruleset() as its
   * second and third arguments; the scoring rules are named so they can be
   * matched up with those values.
   */
  makeRuleset(coeffs, biases) {
    return ruleset([
      /**
       * Image rules
       */
      // consider all visible img elements
      rule(dom('img').when(this.isVisible.bind(this)), type('image')),
      // and divs, which sometimes have CSS background-images
      // TODO: Consider a bonus for <img> tags.
      rule(dom('div').when(fnode => this.isVisible(fnode) && this.hasBackgroundImage(fnode)), type('image')),
      // better score the closer the element is to the top of the page
      rule(type('image'), score(this.isAboveTheFold.bind(this)), {name: 'isAboveTheFoldImage'}),
      // better score for larger images
      rule(type('image'), score(this.isBig.bind(this)), {name: 'isBig'}),
      // bonus for non-extreme aspect ratios, to filter out banners or nav elements
      // TODO: Meant to make this a penalty, but it turns out to work as is.
      // Try as a penalty.
      rule(type('image'), score(this.hasSquareAspectRatio.bind(this)), {name: 'hasSquareAspectRatio'}),
      // no background images, even ones that have reasonable aspect ratios
      // TODO: If necessary, also look at parents. I've seen them say
      // "background" in their IDs as well.
      rule(type('image'), score(this.hasBackgroundInID.bind(this)), {name: 'hasBackgroundInID'}),
      // return image element(s) with max score
      rule(type('image').max(), out('image')),

      /**
       * Title rules
       */
      // consider all eligible h1 elements
      rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('title')),
      // better score based on y-axis proximity to max scoring image element
      rule(type('title'), score(this.isNearImageTopOrBottom.bind(this)), {name: 'isNearImageTopOrBottom'}),
      // return title element(s) with max score
      rule(type('title').max(), out('title')),

      /**
       * Price rules
       */
      // 72% by itself, at [4, 4, 4, 4...]!:
      // consider all eligible span and h2 elements
      rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('price')),
      // check if the element has a '$' in its innerText
      rule(type('price'), score(this.hasDollarSign.bind(this)), {name: 'hasDollarSign'}),
      // better score the closer the element is to the top of the page
      rule(type('price'), score(this.isAboveTheFold.bind(this)), {name: 'isAboveTheFoldPrice'}),
      // check if the id has "price" in it
      rule(type('price'), score(this.hasPriceInID.bind(this)), {name: 'hasPriceInID'}),
      rule(type('price'), score(this.hasPriceInParentID.bind(this)), {name: 'hasPriceInParentID'}),
      // check if any class names have "price" in them
      rule(type('price'), score(this.hasPriceInClassName.bind(this)), {name: 'hasPriceInClassName'}),
      rule(type('price'), score(this.hasPriceInParentClassName.bind(this)), {name: 'hasPriceInParentClassName'}),
      // better score for larger font size
      rule(type('price'), score(this.fontIsBig.bind(this)), {name: 'fontIsBig'}),
      // better score the closer the element is to the max scoring image element
      rule(type('price'), score(this.isNearImage.bind(this)), {name: 'isNearImage'}),
      // check if innerText has a price pattern
      rule(type('price'), score(this.hasPriceishPattern.bind(this)), {name: 'hasPriceishPattern'}),
      // return price element(s) with max score
      rule(type('price').max(), out('price')),
    ],
    coeffs,
    biases);
  }

  /**
   * Return the first entry of the 'image' output of the ruleset the fnode
   * belongs to, or undefined when that output is empty.
   */
  getHighestScoringImage(fnode) {
    return fnode._ruleset.get('image')[0]; // eslint-disable-line no-underscore-dangle
  }
}

/**
 * Map a number onto the closed interval [0, 1] along a straight line.
 *
 * When zeroAt < oneAt the line rises: inputs at or below zeroAt give 0,
 * inputs at or above oneAt give 1, and anything between is interpolated
 * linearly. Passing a zeroAt that is not less than oneAt flips this into a
 * falling line, with 1 on the left (at or below oneAt) and 0 on the right (at
 * or above zeroAt).
 */
function linearScale(number, zeroAt, oneAt) {
  if (zeroAt < oneAt) {
    // Rising line: clamp below zeroAt and above oneAt.
    if (number <= zeroAt) {
      return 0;
    }
    if (number >= oneAt) {
      return 1;
    }
  } else if (number >= zeroAt) {
    // Falling line, right of the zero point.
    return 0;
  } else if (number <= oneAt) {
    // Falling line, left of the one point.
    return 1;
  }

  // Strictly between the two anchors: interpolate along the line that passes
  // through (zeroAt, 0) and (oneAt, 1).
  const gradient = 1 / (oneAt - zeroAt);
  return gradient * (number - zeroAt);
}