[scraper] func: add database, docker support, ingest functions, and basic menu

This commit is contained in:
Aidan 2025-04-23 20:19:47 -04:00
commit 47d8470372
12 changed files with 330 additions and 0 deletions

1
README.md Normal file
View File

@ -0,0 +1 @@
# navigate

24
UNLICENSE Normal file
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <https://unlicense.org/>

43
scraper/.gitignore vendored Normal file
View File

@ -0,0 +1,43 @@
# dependencies (bun install)
node_modules
# output
out
dist
*.tgz
# code coverage
coverage
*.lcov
# logs
logs
*.log
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# caches
.eslintcache
.cache
*.tsbuildinfo
# IntelliJ based IDEs
.idea
# Finder (MacOS) folder config
.DS_Store
# ingest files
src/ingest/urls.txt
# postgres
postgres
# bun
bun.lock*

View File

@ -0,0 +1,10 @@
# Local development Postgres for the scraper.
services:
  postgres:
    image: postgres:17
    environment:
      # WARNING(review): credentials are committed in plain text. Acceptable
      # for a throwaway local container only — rotate and move to an env file
      # before this compose file is used anywhere shared.
      POSTGRES_PASSWORD: abfi29239ed98q93aEa89EriiaKaye896quAirhaAu
      POSTGRES_DB: scraper
    ports:
      - 5432:5432
    volumes:
      # Data dir is bind-mounted to ./postgres (gitignored) so it survives container restarts.
      - ./postgres:/var/lib/postgresql/data

11
scraper/drizzle.config.ts Normal file
View File

@ -0,0 +1,11 @@
import "dotenv/config"
import { defineConfig } from "drizzle-kit"

// Drizzle Kit configuration: reads the Postgres connection string from
// DATABASE_URL (loaded above via dotenv) and emits migration files into
// ./drizzle from the schema in src/db/schema.ts.
const config = defineConfig({
  schema: "./src/db/schema.ts",
  out: "./drizzle",
  dialect: "postgresql",
  dbCredentials: {
    // Non-null assertion: drizzle-kit runs only where DATABASE_URL is set;
    // it fails fast otherwise.
    url: process.env.DATABASE_URL!,
  },
})

export default config

25
scraper/package.json Normal file
View File

@ -0,0 +1,25 @@
{
"name": "navigate",
"module": "src/index.ts",
"type": "module",
"private": true,
"scripts": {
"start": "tsx src/index.ts"
},
"devDependencies": {
"@types/bun": "latest",
"@types/pg": "^8.11.13",
"drizzle-kit": "^0.31.0",
"tsx": "^4.19.3"
},
"peerDependencies": {
"typescript": "^5"
},
"dependencies": {
"axios": "^1.8.4",
"cheerio": "^1.0.0",
"dotenv": "^16.5.0",
"drizzle-orm": "^0.42.0",
"pg": "^8.15.5"
}
}

18
scraper/src/db/schema.ts Normal file
View File

@ -0,0 +1,18 @@
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
// Work queue: one row per URL awaiting scraping. `status` is a free-form
// string; the ingest code in this repo inserts rows with status "pending".
// NOTE(review): varchar(255) can be too short for real-world URLs — consider
// text() or a larger limit before production use.
export const scrapeQueue = pgTable("scrape_queue", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  status: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  // NOTE(review): updatedAt only defaults on insert; nothing here refreshes it on UPDATE.
  updatedAt: timestamp().notNull().defaultNow(),
})
// Scraped results destined for the search index: title/description per URL.
export const searchData = pgTable("search_data", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  title: varchar({ length: 255 }).notNull(),
  description: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

60
scraper/src/index.ts Normal file
View File

@ -0,0 +1,60 @@
import { checkIngest } from "./ingest/ingest"
import { clearScreen, truncate } from "./util/osFunctions"
import getRandomUrl from "./util/toScrape"
import * as readline from "readline"
// Shared readline interface for the interactive menu; every exit path below
// closes it so the process can terminate cleanly.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout
})
/**
 * Wrap readline's callback-based `question` in a Promise so the menu loop
 * can simply `await` the operator's input.
 */
function promptUser(question: string): Promise<string> {
  return new Promise<string>((resolve) => rl.question(question, resolve))
}
checkIngest()
console.log()
/**
 * Interactive menu loop: fetch the next pending URL, display it, and let the
 * operator scrape it or quit. Exits the process when the queue is empty.
 */
async function main() {
  while (true) {
    const url = await getRandomUrl()
    if (!url) {
      console.log("No URLs to scrape")
      rl.close()
      process.exit(0)
    }
    clearScreen()
    // truncate() already appends "..." when it shortens the string, so the
    // old literal "..." in the template produced a double ellipsis. Pad the
    // display text so the right-hand box border stays aligned for short URLs
    // (38 + "..." = 41 chars max, the interior width of the URL row).
    const display = truncate(url, { length: 38 })
    console.log("┌───────────────────────────────────────────────┐")
    console.log("│                NAVIGATE SCRAPER               │")
    console.log("├───────────────────────────────────────────────┤")
    console.log(`│ URL: ${display.padEnd(41)}│`)
    console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
    console.log("┃        [S]crape        ┃        [Q]uit        ┃")
    console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
    // Accept either case and surrounding whitespace ("S", " q ", …), matching
    // what the [S]/[Q] menu labels imply; the old code only accepted lowercase.
    const input = (await promptUser("> ")).trim().toLowerCase()
    if (input === "s") {
      console.log("I would scrape now...")
    } else if (input === "q") {
      clearScreen()
      console.log("\nExiting...\n")
      rl.close()
      process.exit(0)
    } else {
      clearScreen()
      console.log("Invalid input. Please enter 's' to scrape or 'q' to quit.\n")
    }
  }
}
// Top-level error handler: log the failure, release the readline handle, and
// exit non-zero so the shell sees the failure.
main().catch(err => {
  console.error("[!] Error:", err)
  rl.close()
  process.exit(1)
})

View File

@ -0,0 +1,66 @@
import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"
/**
 * Count the distinct, non-blank URLs currently listed in urls.txt (which
 * lives next to this module). Mirrors the cleanup ingestUrls() applies:
 * blank lines dropped, duplicates collapsed (first occurrence wins).
 */
export function checkIngestCount() {
  const urlsFile = path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt")
  const lines = fs.readFileSync(urlsFile, "utf8").split("\n")
  // Set dedupes in O(n) and preserves insertion order, replacing the original
  // O(n²) indexOf scan; blank-line filtering is unchanged.
  const unique = new Set(lines.filter((url) => url.trim() !== ""))
  return unique.size
}
/**
 * Insert every distinct, non-blank URL from urls.txt into scrape_queue with
 * status "pending", then rewrite urls.txt without the successfully ingested
 * lines (failed URLs are kept in the file for a later retry).
 *
 * @returns counts of successful and failed inserts
 */
export async function ingestUrls() {
  // Compute the file path once instead of three times.
  const urlsFile = path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt")
  let remaining = fs.readFileSync(urlsFile, "utf8").split("\n")
  // Drop blank lines, dedupe in O(n) (Set keeps the first occurrence), same
  // cleanup as checkIngestCount().
  const toIngest = new Set(remaining.filter((url) => url.trim() !== ""))
  const db = drizzle(process.env.DATABASE_URL!) as NodePgDatabase<typeof schema>
  let successCt = 0
  let failCt = 0
  for (const url of toIngest) {
    try {
      await db.insert(scrapeQueue).values({
        url,
        status: "pending",
      })
      // Strip every copy of the ingested URL from the file contents.
      remaining = remaining.filter((u) => u !== url)
      successCt++
    } catch (error) {
      console.error(`Failed to ingest: ${url} | ${error}`)
      failCt++
    }
  }
  fs.writeFileSync(urlsFile, remaining.join("\n"))
  return {
    success: successCt,
    failure: failCt,
  }
}
/**
 * Startup ingestion entry point: report how many URLs are queued in urls.txt
 * and, if there are any, ingest them and summarize the outcome.
 */
export async function checkIngest() {
  // Cache the count — the original called checkIngestCount() twice, reading
  // and parsing urls.txt once per call.
  const count = checkIngestCount()
  if (count === 0) {
    console.log("[i] No URLs to ingest")
  } else {
    console.log(`[i] Ingesting ${count} URLs`)
    const { success, failure } = await ingestUrls()
    console.log(`[✓] Ingested ${success} URLs, failed to ingest ${failure} URLs`)
  }
}

View File

@ -0,0 +1,16 @@
import { exec } from "child_process"
/**
 * Clear the terminal.
 *
 * Fix: the previous implementation spawned `cls`/`clear` via
 * child_process.exec, but a child's stdout is piped back to this process
 * rather than written to the terminal, so the screen never actually cleared
 * (and on Windows, `cls` is a cmd.exe builtin, not an executable).
 * console.clear() writes the clear sequence to our own stdout, works on
 * every platform Node supports, and is a no-op when stdout is not a TTY.
 */
export function clearScreen() {
  console.clear()
}
/**
 * Shorten `str` to at most `length` characters, appending "..." when it was
 * cut. Strings already within the limit are returned unchanged.
 */
export function truncate(str: string, { length }: { length: number }): string {
  const needsCut = str.length > length;
  return needsCut ? `${str.slice(0, length)}...` : str;
}

View File

@ -0,0 +1,28 @@
import "dotenv/config"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { eq, asc } from "drizzle-orm"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"
import { Pool } from "pg"
// Module-level connection pool, shared by every call into this module.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
})
// Drizzle handle with the full schema attached so db.query.<table> is typed.
const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>
/**
 * Fetch the next URL to scrape, or null when the queue is empty.
 *
 * NOTE(review): despite the name, this is not random — it returns the
 * *oldest* scrape_queue row whose status is "pending".
 */
async function getRandomUrl() {
  const next = await db.query.scrapeQueue.findFirst({
    where: eq(scrapeQueue.status, "pending"),
    orderBy: [asc(scrapeQueue.createdAt)],
  })
  return next?.url ?? null
}

export default getRandomUrl

28
scraper/tsconfig.json Normal file
View File

@ -0,0 +1,28 @@
{
  // Compiler settings from Bun's project template. The scraper is executed
  // with tsx (see package.json "start"), so tsc is type-check only (noEmit).
  "compilerOptions": {
    // Environment setup & latest features
    "lib": ["ESNext"],
    "target": "ESNext",
    "module": "ESNext",
    "moduleDetection": "force",
    "jsx": "react-jsx",
    "allowJs": true,
    // Bundler mode
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "verbatimModuleSyntax": true,
    "noEmit": true,
    // Best practices
    "strict": true,
    "skipLibCheck": true,
    "noFallthroughCasesInSwitch": true,
    "noUncheckedIndexedAccess": true,
    // Some stricter flags (disabled by default)
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noPropertyAccessFromIndexSignature": false
  }
}