Example Ruleset
This is the simple example ruleset that ships with FathomFox; it is made available for experimentation when you run `fathom fox` without passing in your own ruleset. In its comments, it documents the structure of the `trainees` object, which is what the trainer needs to do its job.
import {ruleset, rule, dom, type, score, out, utils} from 'fathom-web';
const {ancestors, isVisible, linearScale, rgbaFromString, saturation} = utils;
/**
* Rulesets to vectorize or debug (and metadata about them)
*
* More mechanically, a map of names to {coeffs, rulesetMaker, ...} objects,
* which we call "trainees". The rulesets you specify here are available to the
* trainer and also show up in the FathomFox UI, from which you can debug a
* ruleset. Most often, all the entries here point to the same ruleset but have
* different values of `vectorType` for separately training each type of thing
* the ruleset recognizes.
*/
const trainees = new Map();
/**
* An example ruleset. Replace it with your own.
*
* This one finds the full-screen, content-blocking overlays that often go
* behind modal popups. It's not the most well-honed thing, but it's simple and
* short.
*/
trainees.set(
// The ID for this trainee, which must be the same as the Fathom type you
// are evaluating, if you are using the FathomFox Evaluator:
'overlay',
// Here we paste in coefficients from ``fathom train``. This lets us use
// the Evaluator to see what Fathom is getting wrong. Otherwise, these
// numbers do nothing until you deploy your application, so there's no need
// to maintain them until then.
{coeffs: new Map([ // [rule name, coefficient]
['big', 50.4946],
['nearlyOpaque', 48.6396],
['monochrome', 42.8406],
['classOrId', 0.5005],
['visible', 55.8750]]),
// Bias is -139.3106 for this example, though that isn't needed until
// production.
// The content-area size to use while training. Defaults to 1024x768.
viewportSize: {width: 1024, height: 768},
// The type of node to extract features from when using the Vectorizer.
// Defaults to the trainee ID.
//
// vectorType: 'overlay',
rulesetMaker:
function () {
/**
 * Score how closely the fnode's element matches the viewport in size.
 *
 * The absolute height and width differences between the element's bounding
 * rect and the window's inner dimensions are summed, and that total is handed
 * to linearScale() with 250 and 0 as its two anchor points.
 *
 * @param fnode A Fathom node whose `element` supports getBoundingClientRect().
 * @return Whatever linearScale() yields for the summed mismatch (presumably
 *     0 at a mismatch of 250px or more, rising to 1 at an exact match --
 *     confirm against the fathom-web utils documentation).
 */
function big(fnode) {
    // The yardstick is the viewport, not the whole document: spot-checking so
    // far shows overlays are only ever viewport-sized.
    const {height, width} = fnode.element.getBoundingClientRect();
    const sizeMismatch = Math.abs(height - window.innerHeight) + Math.abs(width - window.innerWidth);
    // 250px is getting into "too tall to just be nav or something" territory.
    return linearScale(sizeMismatch, 250, 0);
}
/**
 * Score whether the fnode is almost, but not entirely, opaque.
 *
 * The element's computed `opacity` is multiplied by the alpha channel of its
 * computed `background-color` (treated as 1 when rgbaFromString() reports no
 * alpha). A product of exactly 1 scores 0; anything else is passed to
 * linearScale() with anchors .4 and .6.
 *
 * @param fnode A Fathom node wrapping the element to inspect.
 * @return 0 for a fully opaque element, otherwise the linearScale() result.
 */
function nearlyOpaque(fnode) {
    const computed = getComputedStyle(fnode.element);
    const elementOpacity = parseFloat(computed.getPropertyValue('opacity'));
    const parsedAlpha = rgbaFromString(computed.getPropertyValue('background-color'))[3];
    // A missing alpha component means the background color is fully opaque.
    const backgroundAlpha = parsedAlpha === undefined ? 1 : parsedAlpha;
    const combinedOpacity = elementOpacity * backgroundAlpha;
    // Exact comparison seems to work even though this is a float.
    if (combinedOpacity === 1) {
        return 0;
    }
    return linearScale(combinedOpacity, .4, .6);
}
/**
 * Score whether the fnode's background color is nearly black or white.
 *
 * The computed `background-color` is parsed with rgbaFromString(), its
 * components are spread into saturation(), and one minus that result is
 * passed to linearScale() with anchors .96 and 1.
 *
 * @param fnode A Fathom node wrapping the element to inspect.
 * @return The linearScale() result for the color's lack of saturation.
 */
function monochrome(fnode) {
    const backgroundColor = getComputedStyle(fnode.element).getPropertyValue('background-color');
    const channels = rgbaFromString(backgroundColor);
    const desaturation = 1 - saturation(...channels);
    return linearScale(desaturation, .96, 1);
}
/**
 * Score how strongly the element's `class` and `id` attributes hint at an
 * overlay.
 *
 * For each of the two attributes, counts how many of the substrings "popup",
 * "modal", "overlay", "underlay", and "backdrop" appear in its raw value.
 * Matching is case-sensitive, and each substring counts at most once per
 * attribute. The total is then squashed into a score with
 * 1 - .3 ** (count + .1685).
 *
 * @param fnode A Fathom node whose `element` supports getAttribute().
 * @return A number below 1: about .18 for no occurrences, about .75 for one,
 *     about .93 for two, approaching 1 as the count grows.
 */
function suspiciousClassOrId(fnode) {
    const suspiciousSubstrings = ['popup', 'modal', 'overlay', 'underlay', 'backdrop'];
    let numOccurrences = 0;
    for (const attributeName of ['class', 'id']) {
        // getAttribute() yields a single string (or null when the attribute
        // is absent), so the whole value is searched as one string.
        const value = fnode.element.getAttribute(attributeName);
        if (value) {
            numOccurrences += suspiciousSubstrings.filter((substring) => value.includes(substring)).length;
        }
    }
    // 1 occurrence gets us to about 75% certainty; 2, 93%. With no
    // occurrences the score is about .18 rather than 0, and it tops out at 1.
    // TODO: Figure out how to derive the magic number .1685 from
    // 0 and 1.
    return (-(.3 ** (numOccurrences + .1685)) + 1);
}
/* The actual ruleset */
const rules = ruleset([
// Consider all <div> tags as candidate overlays:
rule(dom('div'), type('overlay')),
// Contribute the "bigness" of the node to its overlay score:
rule(type('overlay'), score(big), {name: 'big'}),
// Contibute the opacity of the node to its overlay score:
rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}),
// Contribute some other signals as well:
rule(type('overlay'), score(monochrome), {name: 'monochrome'}),
rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}),
rule(type('overlay'), score(isVisible), {name: 'visible'}),
// Offer the max-scoring overlay-typed node under the output key
// "overlay". The score on that node will represent the probability,
// informed by a corpus of training samples, that the node is, indeed,
// a pop-up overlay.
rule(type('overlay').max(), out('overlay'))
]);
return rules;
}
// isTarget is an optional function which returns whether the Vectorizer
// should consider a fnode a target. The default is to consider it a
// target iff its ``data-fathom`` attribute === the trainee ID.
//
// isTarget: fnode => fnode.element.dataset.fathom === 'foo'
}
);
export default trainees;
|