wip
This commit is contained in:
403
scripts/dom-export/exportHtmlDumps.ts
Normal file
403
scripts/dom-export/exportHtmlDumps.ts
Normal file
@@ -0,0 +1,403 @@
|
||||
// Developer-only scripts moved out of infrastructure: DOM exporter for local HTML dumps.
|
||||
// NOT for production automation; intended as a developer utility to generate compact DOM exports
|
||||
// for manual inspection and to aid writing Playwright automations.
|
||||
//
|
||||
// Usage (from repo root):
|
||||
// npm install -D playwright ts-node typescript @types/node
|
||||
// npx playwright install
|
||||
// npx ts-node scripts/dom-export/exportHtmlDumps.ts
|
||||
//
|
||||
// Output: ./html-dumps-optimized/*.json
|
||||
//
|
||||
// This file intentionally contains both the in-page extractor string (exported) and the
|
||||
// Playwright runner that iterates ./html-dumps/*.html and writes .json files into
|
||||
// ./html-dumps-optimized. These artifacts are developer helpers and must not be imported
|
||||
// into production automation code.
|
||||
|
||||
const fs = require("fs").promises;
const path = require("path");
const { pathToFileURL } = require("url");

const { chromium } = require("playwright");
|
||||
|
||||
// Both dump directories live directly under the directory the script is started from.
const resolveFromCwd = (folder: string): string => path.join(process.cwd(), folder);

// Source directory scanned for .html dumps.
const INPUT_DIR = resolveFromCwd("html-dumps");
// Destination directory for the generated .json exports.
const OUTPUT_DIR = resolveFromCwd("html-dumps-optimized");
|
||||
|
||||
// Developer helper: in-page DOM extractor, kept as JavaScript source text so it can be
// handed to the browser unchanged. Plain consts are used (rather than ES exports) to avoid
// ES module import/export issues when running with ts-node in CJS mode.
//
// The extractor is defined exactly once, in `domExtractor` (the source of an arrow function
// that returns the item array). `domExportScript` is derived from it, so the console variant
// and the Playwright variant cannot drift apart.
//
// Each exported item uses short field names to keep the output compact:
//   el = short tag name        x = short semantic DOM path
//   t  = visible text          l = aria-label            p = placeholder
//   n  = name attribute        i = id (only when it does not look generated)
//   d  = data-testid           r = role
//   h  = 3-hex-digit hash, present only on repeats of an otherwise identical item
const domExtractor = `() => {
  const MAX_TEXT = 60;

  // Collapse whitespace runs, trim, and cap at MAX_TEXT characters; falsy input becomes null.
  const clean = t =>
    t ? t.replace(/\\s+/g, " ").trim().slice(0, MAX_TEXT) : null;

  // Ids containing ":-" or a colon-delimited alphanumeric token (such as ":r1:") are treated
  // as dynamic and are never used as anchors.
  const isDynamicId = id =>
    id && (id.includes(":-") || /:[a-z0-9]+:/i.test(id));

  // Abbreviate the common tags; anything else is just lower-cased.
  const shortTag = t => ({
    BUTTON: "bu",
    A: "a",
    INPUT: "in",
    SELECT: "s",
    TEXTAREA: "ta",
    DIV: "d",
    SPAN: "sp"
  }[t] || t.toLowerCase());

  // Class names that are dropped from paths: empty, shorter than 3 characters, "css-" prefixed,
  // 6+ purely alphanumeric characters, or a 1-3 character prefix, a dash, then 4+ word characters.
  const isNoiseClass = c =>
    !c ||
    c.length < 3 ||
    /^css-/.test(c) ||
    /^[a-z0-9]{6,}$/i.test(c) ||
    /^\\w{1,3}-\\w{4,}$/.test(c);

  // Position of node among the siblings that share its tag name, plus how many there are.
  const siblingIndex = node => {
    const sib = [...node.parentNode.children]
      .filter(n => n.tagName === node.tagName);
    return { idx: sib.indexOf(node), count: sib.length };
  };

  // Build a ">"-joined path from el upwards. The walk stops at document.body, after 5 levels,
  // or right after the first node that carries a stable id, any data-* attribute, or a role.
  const getSemSiblingPath = el => {
    const parts = [];
    let node = el;
    let depth = 0;

    while (node && node.nodeType === 1 && node !== document.body && depth < 5) {
      const { idx, count } = siblingIndex(node);
      const isTarget = node === el;
      const tag = shortTag(node.tagName);

      // The target always gets ":idx"; ancestors get "@idx" only when they have same-tag siblings.
      const targetSuffix = isTarget && idx >= 0 ? ":" + idx : "";
      const parentSuffix = !isTarget && count > 1 && idx >= 0 ? "@" + idx : "";
      const sibSuffix = targetSuffix || parentSuffix;

      // Keep at most two non-noise classes; "c0" is the placeholder when none remain.
      let cls = [...node.classList].filter(c => !isNoiseClass(c));
      if (cls.length > 2) cls = cls.slice(0, 2);
      if (!cls.length) cls = ["c0"];

      const attrs = [];

      const id = node.id;
      if (id && !isDynamicId(id)) attrs.push("#" + id);

      const attrNames = node.getAttributeNames ? node.getAttributeNames() : [];
      let hasDataAttr = false;
      for (const a of attrNames) {
        if (a.startsWith("data-")) {
          attrs.push("[" + a + "=" + node.getAttribute(a) + "]");
          hasDataAttr = true;
        }
      }

      const role = node.getAttribute ? node.getAttribute("role") : null;
      if (role) attrs.push("[r=" + role + "]");

      const chunk = tag + "." + cls.join(".") + attrs.join("") + (sibSuffix ? sibSuffix : "");
      parts.unshift(chunk);

      depth += 1;

      const hasStrongAnchor =
        (id && !isDynamicId(id)) || hasDataAttr || !!role;

      if (depth >= 5 || hasStrongAnchor) break;

      node = node.parentNode;
    }

    return parts.join(">");
  };

  const items = [];
  const seen = new Map();

  // Append an item. When an item with the same key was already added, tag this one with a
  // 12-bit hash of the key plus its occurrence count so repeats can be told apart.
  const addItem = o => {
    const keyParts = [o.el, o.x];
    if (o.t) keyParts.push("t=" + o.t);
    if (o.l) keyParts.push("l=" + o.l);
    if (o.p) keyParts.push("p=" + o.p);
    if (o.n) keyParts.push("n=" + o.n);
    if (o.i) keyParts.push("i=" + o.i);
    if (o.d) keyParts.push("d=" + o.d);
    if (o.r) keyParts.push("r=" + o.r);
    const key = keyParts.join("|");
    const prev = seen.get(key) || 0;
    if (prev > 0) {
      let h = 0;
      const str = key + "#" + prev;
      for (let i = 0; i < str.length; i++) {
        h = (h * 31 + str.charCodeAt(i)) >>> 0;
      }
      const hex = (h & 0xfff).toString(16).padStart(3, "0");
      o.h = hex;
    }
    seen.set(key, prev + 1);
    items.push(o);
  };

  const elements = [...document.querySelectorAll("button,a,input,select,textarea")];

  for (const e of elements) {
    const t = clean(e.innerText);
    const l = clean(e.getAttribute("aria-label"));
    const p = clean(e.getAttribute("placeholder"));
    const n = e.getAttribute("name");
    const r = e.getAttribute("role");
    const id = e.id;
    const stableId = isDynamicId(id) ? null : id;
    const d = e.getAttribute("data-testid");

    // skip menuitems with no meaningful text/label/placeholder
    if (r === "menuitem" && !t && !l && !p) continue;

    // keep only meaningful ones
    if (!(t || l || p || n || stableId || d || r)) continue;

    const o = { el: shortTag(e.tagName), x: getSemSiblingPath(e) };

    if (t) o.t = t;
    if (l && l !== t) o.l = l;
    if (p && p !== t && p !== l) o.p = p;
    if (n) o.n = n;
    if (stableId) o.i = stableId;
    if (d) o.d = d;
    if (r) o.r = r;

    addItem(o);
  }

  return items;
}`;

// Self-invoking variant for pasting into a browser console: runs the shared extractor,
// logs the size of the pretty-printed JSON, the item count and the items, then returns them.
const domExportScript = `(() => {
  const items = (${domExtractor})();

  const json = JSON.stringify(items, null, 2);
  console.log("chars:", json.length);
  console.log("elements:", items.length);
  console.log(items);

  return items;
})();`;
|
||||
|
||||
// Only the self-invoking console script is exposed to other modules; the bare extractor
// source (domExtractor) is not part of the exported object.
module.exports = { domExportScript };
|
||||
|
||||
/**
 * Best-effort creation of `dir`, including any missing parent directories.
 *
 * The call never rejects: a failure is reported as a warning so the caller can carry on,
 * but it is no longer dropped silently. Because `recursive: true` is passed, an error
 * reaching the catch is expected to be a genuine problem rather than "already exists".
 *
 * @param dir Directory path to create.
 */
async function ensureDir(dir: string): Promise<void> {
  try {
    await fs.mkdir(dir, { recursive: true });
  } catch (err: unknown) {
    console.warn("Could not create directory:", dir, err);
  }
}
|
||||
|
||||
/**
 * Opens every `.html` file found (recursively) under INPUT_DIR in headless Chromium, runs the
 * in-page extractor against it, and writes the returned items as pretty-printed JSON to the
 * same relative path under OUTPUT_DIR with the extension changed to `.json`.
 *
 * A failure while processing one file is logged and the remaining files are still processed.
 * If the input directory cannot be read, the error is logged and the process exits with
 * status 1. When no `.html` file is found the function logs that and returns without
 * launching a browser.
 */
async function exportAll(): Promise<void> {
  await ensureDir(OUTPUT_DIR);

  // Structural subset of the entries produced by readdir({ withFileTypes: true }) that is used here.
  type DirEntry = { name: string; isDirectory(): boolean; isFile(): boolean };

  /** Recursively lists `.html` files below `dir`, as paths relative to INPUT_DIR. */
  async function collectHtmlFiles(dir: string): Promise<string[]> {
    const entries: DirEntry[] = await fs.readdir(dir, { withFileTypes: true });
    const results: string[] = [];
    for (const ent of entries) {
      const p = path.join(dir, ent.name);
      if (ent.isDirectory()) {
        results.push(...(await collectHtmlFiles(p)));
      } else if (ent.isFile() && ent.name.endsWith(".html")) {
        results.push(path.relative(INPUT_DIR, p));
      }
    }
    return results;
  }

  let htmlFiles: string[] = [];
  try {
    htmlFiles = await collectHtmlFiles(INPUT_DIR);
  } catch (err) {
    console.error(
      "Could not read input directory recursively:",
      INPUT_DIR,
      err
    );
    process.exit(1);
    return;
  }

  if (htmlFiles.length === 0) {
    console.log("No .html files found in", INPUT_DIR);
    return;
  }

  const browser = await chromium.launch({ headless: true });
  try {
    for (const file of htmlFiles) {
      const abs = path.join(INPUT_DIR, file);
      // Build a proper file: URL. Plain "file://" + path concatenation is wrong for Windows
      // drive paths and for names containing characters such as "#", "?" or "%".
      const url = pathToFileURL(abs).href;
      const page = await browser.newPage();
      try {
        await page.goto(url, { waitUntil: "domcontentloaded", timeout: 10000 });
        // The extractor is source text; wrap it in a Function whose body invokes it so that
        // page.evaluate receives a callable.
        const items = await page.evaluate(
          new Function("return (" + domExtractor + ")()") as any
        );
        const outPath = path.join(OUTPUT_DIR, file.replace(/\.html$/, ".json"));
        // Input files may sit in nested folders; mirror that structure on the output side.
        await fs.mkdir(path.dirname(outPath), { recursive: true });
        await fs.writeFile(outPath, JSON.stringify(items, null, 2), "utf8");
        console.log(
          "exported " +
            file +
            " -> " +
            path.relative(process.cwd(), outPath) +
            " (elements: " +
            (Array.isArray(items) ? items.length : 0) +
            ")"
        );
      } catch (e) {
        console.error("Failed processing", file, e);
      } finally {
        // Always release the page, even when navigation or extraction failed.
        await page.close();
      }
    }
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
// Run the export only when this file is the process entry point, not when it is required
// by another module. Any rejection from exportAll is printed and turns into exit status 1.
if (require.main === module) {
  const failAndExit = (err: unknown): void => {
    console.error(err);
    process.exit(1);
  };
  exportAll().then(undefined, failAndExit);
}
|
||||
Reference in New Issue
Block a user