404 lines
12 KiB
TypeScript
404 lines
12 KiB
TypeScript
// Developer-only script, kept out of the infrastructure code: a DOM exporter for local HTML dumps.
// NOT for production automation; it is a developer utility that generates compact DOM exports
// for manual inspection and as an aid when writing Playwright automations.
//
// Usage (from repo root):
//   npm install -D playwright ts-node typescript @types/node
//   npx playwright install
//   npx ts-node scripts/dom-export/exportHtmlDumps.ts
//
// Input:  ./html-dumps/**/*.html (searched recursively)
// Output: ./html-dumps-optimized/**/*.json (same relative paths, .html replaced by .json)
//
// This file intentionally contains both the in-page extractor string (exported) and the
// Playwright runner that walks ./html-dumps and writes the .json files into
// ./html-dumps-optimized. These artifacts are developer helpers and must not be imported
// into production automation code.
|
|
|
|
const { chromium } = require("playwright");
|
|
const fs = require("fs").promises;
|
|
const path = require("path");
|
|
|
|
// Both directories are resolved against the current working directory.
// INPUT_DIR holds the raw .html dumps; OUTPUT_DIR receives the generated .json files.
const INPUT_DIR = path.resolve(process.cwd(), "html-dumps");
const OUTPUT_DIR = path.resolve(process.cwd(), "html-dumps-optimized");
|
|
|
|
// Developer helper: in-page DOM extractor string (for the browser console or page.evaluate).
// Kept as a plain const to avoid ES module import/export issues when running with ts-node in CJS mode.
// This version compresses the output aggressively, using short tag/field names and a short semantic DOM path.
|
|
/**
 * Source text of the in-page extractor, written as an arrow function expression.
 *
 * When evaluated and called inside a page it scans every
 * `button, a, input, select, textarea` element and returns an array of compact
 * records with these fields:
 *   - `el`: short tag name (e.g. "bu" for BUTTON)
 *   - `x` : short semantic path built from up to 5 ancestors
 *   - `t` : visible text, `l`: aria-label, `p`: placeholder (each capped at 60 chars)
 *   - `n` : name attribute, `i`: id (only when it does not look generated)
 *   - `d` : data-testid, `r`: role
 *   - `h` : 3-hex-digit disambiguator, present only on repeated identical records
 *
 * It is kept as a string so the same logic can be sent to the browser by the runner
 * and embedded into `domExportScript` without maintaining two copies.
 */
const domExtractor = `() => {
  // Longest text / label / placeholder kept per element.
  const MAX_TEXT = 60;

  // Collapse whitespace runs, trim and cap the length; falsy input yields null.
  const clean = t =>
    t ? t.replace(/\\s+/g, " ").trim().slice(0, MAX_TEXT) : null;

  // Ids containing ":-" or a ":token:" segment are treated as generated, not stable.
  const isDynamicId = id =>
    id && (id.includes(":-") || /:[a-z0-9]+:/i.test(id));

  // Abbreviate the most common tags; anything else is just lower-cased.
  const shortTag = t => ({
    BUTTON: "bu",
    A: "a",
    INPUT: "in",
    SELECT: "s",
    TEXTAREA: "ta",
    DIV: "d",
    SPAN: "sp"
  }[t] || t.toLowerCase());

  // Heuristic filter for class names that carry no meaning for a human reader:
  // very short names, "css-" prefixed names, 6+ character alphanumeric runs and
  // "xx-xxxx" style prefixed tokens.
  const isNoiseClass = c =>
    !c ||
    c.length < 3 ||
    /^css-/.test(c) ||
    /^[a-z0-9]{6,}$/i.test(c) ||
    /^\\w{1,3}-\\w{4,}$/.test(c);

  // Position of a node among its same-tag siblings, plus how many such siblings exist.
  const siblingIndex = node => {
    const sib = [...node.parentNode.children]
      .filter(n => n.tagName === node.tagName);
    return { idx: sib.indexOf(node), count: sib.length };
  };

  // Build the short path for an element by walking up from it (at most 5 levels,
  // never past document.body). Each level contributes
  //   tag.class1.class2[#id][data-*=value][r=role] + sibling suffix
  // where the target element gets ":idx" and ancestors get "@idx" only when they
  // have same-tag siblings. The walk stops early at the first level that has a
  // stable id, any data-* attribute or a role.
  const getSemSiblingPath = el => {
    const parts = [];
    let node = el;
    let depth = 0;

    while (node && node.nodeType === 1 && node !== document.body && depth < 5) {
      const { idx, count } = siblingIndex(node);
      const isTarget = node === el;
      const tag = shortTag(node.tagName);

      const targetSuffix = isTarget && idx >= 0 ? ":" + idx : "";
      const parentSuffix = !isTarget && count > 1 && idx >= 0 ? "@" + idx : "";
      const sibSuffix = targetSuffix || parentSuffix;

      // Keep at most two meaningful classes; "c0" marks "no usable class".
      let cls = [...node.classList].filter(c => !isNoiseClass(c));
      if (cls.length > 2) cls = cls.slice(0, 2);
      if (!cls.length) cls = ["c0"];

      const attrs = [];

      const id = node.id;
      if (id && !isDynamicId(id)) attrs.push("#" + id);

      const attrNames = node.getAttributeNames ? node.getAttributeNames() : [];
      let hasDataAttr = false;
      for (const a of attrNames) {
        if (a.startsWith("data-")) {
          attrs.push("[" + a + "=" + node.getAttribute(a) + "]");
          hasDataAttr = true;
        }
      }

      const role = node.getAttribute ? node.getAttribute("role") : null;
      if (role) attrs.push("[r=" + role + "]");

      const chunk = tag + "." + cls.join(".") + attrs.join("") + (sibSuffix ? sibSuffix : "");
      parts.unshift(chunk);

      depth += 1;

      const hasStrongAnchor =
        (id && !isDynamicId(id)) || hasDataAttr || !!role;

      if (depth >= 5 || hasStrongAnchor) break;

      node = node.parentNode;
    }

    return parts.join(">");
  };

  const items = [];
  // Maps a record key to the number of times that exact record has been seen.
  const seen = new Map();

  // Append a record. A record whose key was already seen gets an extra "h" field:
  // the low 12 bits of a 31-multiplier rolling hash over key + "#" + occurrence count,
  // rendered as 3 hex digits.
  const addItem = o => {
    const keyParts = [o.el, o.x];
    if (o.t) keyParts.push("t=" + o.t);
    if (o.l) keyParts.push("l=" + o.l);
    if (o.p) keyParts.push("p=" + o.p);
    if (o.n) keyParts.push("n=" + o.n);
    if (o.i) keyParts.push("i=" + o.i);
    if (o.d) keyParts.push("d=" + o.d);
    if (o.r) keyParts.push("r=" + o.r);
    const key = keyParts.join("|");
    const prev = seen.get(key) || 0;
    if (prev > 0) {
      let h = 0;
      const str = key + "#" + prev;
      for (let i = 0; i < str.length; i++) {
        h = (h * 31 + str.charCodeAt(i)) >>> 0;
      }
      const hex = (h & 0xfff).toString(16).padStart(3, "0");
      o.h = hex;
    }
    seen.set(key, prev + 1);
    items.push(o);
  };

  const elements = [...document.querySelectorAll("button,a,input,select,textarea")];

  for (const e of elements) {
    const t = clean(e.innerText);
    const l = clean(e.getAttribute("aria-label"));
    const p = clean(e.getAttribute("placeholder"));
    const n = e.getAttribute("name");
    const r = e.getAttribute("role");
    const id = e.id;
    const stableId = isDynamicId(id) ? null : id;
    const d = e.getAttribute("data-testid");

    // skip menuitems with no meaningful text/label/placeholder
    if (r === "menuitem" && !t && !l && !p) continue;

    // keep only elements that expose at least one identifying field
    if (!(t || l || p || n || stableId || d || r)) continue;

    const o = { el: shortTag(e.tagName), x: getSemSiblingPath(e) };

    // Label and placeholder are dropped when they merely repeat an earlier field.
    if (t) o.t = t;
    if (l && l !== t) o.l = l;
    if (p && p !== t && p !== l) o.p = p;
    if (n) o.n = n;
    if (stableId) o.i = stableId;
    if (d) o.d = d;
    if (r) o.r = r;

    addItem(o);
  }

  return items;
}`;

/**
 * Console-ready variant of the extractor: a self-invoking script (source text) that runs
 * `domExtractor`, logs the pretty-printed JSON length ("chars:"), the record count
 * ("elements:") and the records themselves, then evaluates to the records array.
 *
 * It embeds `domExtractor` instead of repeating its body, so the exported script and the
 * runner always execute the same extraction logic.
 */
const domExportScript = `(() => {
  const items = (${domExtractor})();

  const json = JSON.stringify(items, null, 2);
  console.log("chars:", json.length);
  console.log("elements:", items.length);
  console.log(items);

  return items;
})();`;
|
|
|
|
// Only the self-invoking console script is exported; domExtractor is not part of the exports.
module.exports = { domExportScript };
|
|
|
|
/**
 * Best-effort creation of `dir`, including any missing parent directories.
 *
 * Never rejects: a failure is reported as a warning and the caller continues.
 *
 * @param dir Directory path to create.
 */
async function ensureDir(dir: string): Promise<void> {
  try {
    await fs.mkdir(dir, { recursive: true });
  } catch (err: unknown) {
    // With `recursive: true` an already existing directory is not an error, so anything
    // caught here is a real problem (permissions, a file in the way, ...). Surface it
    // instead of hiding it, but keep the best-effort contract.
    console.warn("Could not create directory:", dir, err);
  }
}
|
|
|
|
/**
 * Runs the in-page extractor against every `.html` file found (recursively) under
 * INPUT_DIR and writes each result as pretty-printed JSON to the same relative path
 * under OUTPUT_DIR, with the `.html` suffix replaced by `.json`.
 *
 * - If INPUT_DIR cannot be read, the error is logged and the process exits with code 1.
 * - If no `.html` file is found, a message is logged and the function returns.
 * - A failure while processing one file is logged and the remaining files are still processed.
 * - The browser is always closed, and each page is closed after its file is handled.
 */
async function exportAll(): Promise<void> {
  // Local require: `pathToFileURL` is only needed here.
  const { pathToFileURL } = require("url");

  /** The parts of a directory entry (readdir with `withFileTypes: true`) used below. */
  interface DirEntry {
    name: string;
    isDirectory(): boolean;
    isFile(): boolean;
  }

  await ensureDir(OUTPUT_DIR);

  /** Recursively lists `.html` files under `dir` as paths relative to INPUT_DIR. */
  async function collectHtmlFiles(dir: string): Promise<string[]> {
    const entries: DirEntry[] = await fs.readdir(dir, { withFileTypes: true });
    const results: string[] = [];
    for (const ent of entries) {
      const p = path.join(dir, ent.name);
      if (ent.isDirectory()) {
        results.push(...(await collectHtmlFiles(p)));
      } else if (ent.isFile() && ent.name.endsWith(".html")) {
        results.push(path.relative(INPUT_DIR, p));
      }
    }
    return results;
  }

  let htmlFiles: string[] = [];
  try {
    htmlFiles = await collectHtmlFiles(INPUT_DIR);
  } catch (err) {
    console.error(
      "Could not read input directory recursively:",
      INPUT_DIR,
      err
    );
    process.exit(1);
    return;
  }

  if (htmlFiles.length === 0) {
    console.log("No .html files found in", INPUT_DIR);
    return;
  }

  const browser = await chromium.launch({ headless: true });
  try {
    for (const file of htmlFiles) {
      const abs = path.join(INPUT_DIR, file);
      // pathToFileURL percent-encodes spaces, "#", "?" and "%" and handles Windows drive
      // paths; plain "file://" + path produces a broken URL in those cases.
      const url: string = pathToFileURL(abs).href;
      const page = await browser.newPage();
      try {
        await page.goto(url, { waitUntil: "domcontentloaded", timeout: 10000 });
        // domExtractor is the source of an arrow function; wrap it in a call expression
        // and let the page evaluate that expression.
        const items: unknown = await page.evaluate("(" + domExtractor + ")()");
        const outPath = path.join(OUTPUT_DIR, file.replace(/\.html$/, ".json"));
        // The input tree may contain sub-directories; mirror them on the output side.
        await fs.mkdir(path.dirname(outPath), { recursive: true });
        await fs.writeFile(outPath, JSON.stringify(items, null, 2), "utf8");
        console.log(
          "exported " +
            file +
            " -> " +
            path.relative(process.cwd(), outPath) +
            " (elements: " +
            (Array.isArray(items) ? items.length : 0) +
            ")"
        );
      } catch (e) {
        // One bad file must not abort the whole batch.
        console.error("Failed processing", file, e);
      } finally {
        await page.close();
      }
    }
  } finally {
    await browser.close();
  }
}
|
|
|
|
// Entry point: start the export only when this file is the module Node was launched with.
// A rejected export is printed and turned into exit code 1.
const launchedAsScript = require.main === module;
if (launchedAsScript) {
  exportAll().then(undefined, (failure: unknown) => {
    console.error(failure);
    process.exit(1);
  });
}
|