[scraper] func: add database, docker support, ingest functions, and basic menu

commit 47d8470372
UNLICENSE (Normal file, +24)
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org/>
scraper/.gitignore (vendored, +43)
@@ -0,0 +1,43 @@
# dependencies (bun install)
node_modules

# output
out
dist
*.tgz

# code coverage
coverage
*.lcov

# logs
logs
*.log
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# caches
.eslintcache
.cache
*.tsbuildinfo

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store

# ingest files
src/ingest/urls.txt

# postgres
postgres

# bun
bun.lock*
scraper/docker-compose.yml (Normal file, +10)
@@ -0,0 +1,10 @@
services:
  postgres:
    image: postgres:17
    environment:
      POSTGRES_PASSWORD: abfi29239ed98q93aEa89EriiaKaye896quAirhaAu
      POSTGRES_DB: scraper
    ports:
      - 5432:5432
    volumes:
      - ./postgres:/var/lib/postgresql/data
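The .env file read through dotenv is gitignored and not part of this commit, so the connection string it has to provide is an inference: given the service above (the postgres image's default user, the password and database set here, port 5432 on localhost), it would presumably be

    DATABASE_URL=postgres://postgres:abfi29239ed98q93aEa89EriiaKaye896quAirhaAu@localhost:5432/scraper

Since the password is committed in plaintext in the compose file, the .env value simply mirrors it.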
scraper/drizzle.config.ts (Normal file, +11)
@@ -0,0 +1,11 @@
import "dotenv/config"
import { defineConfig } from "drizzle-kit"

export default defineConfig({
  out: "./drizzle",
  schema: "./src/db/schema.ts",
  dialect: "postgresql",
  dbCredentials: {
    url: process.env.DATABASE_URL!,
  },
})
scraper/package.json (Normal file, +25)
@@ -0,0 +1,25 @@
{
  "name": "navigate",
  "module": "src/index.ts",
  "type": "module",
  "private": true,
  "scripts": {
    "start": "tsx src/index.ts"
  },
  "devDependencies": {
    "@types/bun": "latest",
    "@types/pg": "^8.11.13",
    "drizzle-kit": "^0.31.0",
    "tsx": "^4.19.3"
  },
  "peerDependencies": {
    "typescript": "^5"
  },
  "dependencies": {
    "axios": "^1.8.4",
    "cheerio": "^1.0.0",
    "dotenv": "^16.5.0",
    "drizzle-orm": "^0.42.0",
    "pg": "^8.15.5"
  }
}
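The commit includes no run instructions; with these scripts and the compose file above, a plausible local bootstrap (an assumption, not documented in the repo) is:

    docker compose up -d
    bun install
    bun run start    # executes "tsx src/index.ts"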
scraper/src/db/schema.ts (Normal file, +18)
@@ -0,0 +1,18 @@
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"

export const scrapeQueue = pgTable("scrape_queue", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  status: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})

export const searchData = pgTable("search_data", {
  id: integer().primaryKey().generatedAlwaysAsIdentity(),
  url: varchar({ length: 255 }).notNull(),
  title: varchar({ length: 255 }).notNull(),
  description: varchar({ length: 255 }).notNull(),
  createdAt: timestamp().notNull().defaultNow(),
  updatedAt: timestamp().notNull().defaultNow(),
})
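Nothing in this commit actually creates these tables. With the drizzle.config.ts above pointing at src/db/schema.ts, one way to sync the schema to the running Postgres (an assumption, not shown in the repo) is drizzle-kit's push command:

    bunx drizzle-kit push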
scraper/src/index.ts (Normal file, +60)
@@ -0,0 +1,60 @@
import { checkIngest } from "./ingest/ingest"
import { clearScreen, truncate } from "./util/osFunctions"
import getRandomUrl from "./util/toScrape"
import * as readline from "readline"

const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout
})

function promptUser(question: string): Promise<string> {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer)
    })
  })
}

checkIngest()
console.log()

async function main() {
  while (true) {
    const url = await getRandomUrl()
    if (!url) {
      console.log("No URLs to scrape")
      rl.close()
      process.exit(0)
    }

    clearScreen()

    console.log("┌───────────────────────────────────────────────┐")
    console.log("│                NAVIGATE SCRAPER               │")
    console.log("├───────────────────────────────────────────────┤")
    console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
    console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
    console.log("┃        [S]crape        ┃        [Q]uit        ┃")
    console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")

    const input = await promptUser("> ")
    if (input === "s") {
      console.log("I would scrape now...")
    } else if (input === "q") {
      clearScreen()
      console.log("\nExiting...\n")
      rl.close()
      process.exit(0)
    } else {
      clearScreen()
      console.log("Invalid input. Please enter 's' to scrape or 'q' to quit.\n")
    }
  }
}

main().catch(err => {
  console.error("[!] Error:", err)
  rl.close()
  process.exit(1)
})
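Two small quirks in the menu as committed: the URL row appends "..." after truncate, which already adds its own ellipsis when it shortens the string, so long URLs render with six dots; and the input check is case-sensitive, so an uppercase S or Q falls through to the invalid-input branch. A hypothetical normalization for the latter (not in the commit):

    const input = (await promptUser("> ")).trim().toLowerCase()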
scraper/src/ingest/ingest.ts (Normal file, +66)
@@ -0,0 +1,66 @@
import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"

export function checkIngestCount() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  const urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  return duplicatesRemoved.length
}

export async function ingestUrls() {
  const urls = fs.readFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), "utf8")
  let urlArray = urls.split("\n")

  // Cleanup
  const blankLinesRemoved = urlArray.filter((url) => url.trim() !== "")
  const duplicatesRemoved = blankLinesRemoved.filter((url, index, self) => self.indexOf(url) === index)

  // Ingest
  const db = drizzle(process.env.DATABASE_URL!) as NodePgDatabase<typeof schema>

  let successCt = 0
  let failCt = 0

  for (const url of duplicatesRemoved) {
    try {
      await db.insert(scrapeQueue).values({
        url,
        status: "pending",
      })

      urlArray = urlArray.filter((u) => u !== url)

      successCt++
    } catch (error) {
      console.error(`Failed to ingest: ${url} | ${error}`)
      failCt++
    }
  }

  fs.writeFileSync(path.join(path.dirname(fileURLToPath(import.meta.url)), "urls.txt"), urlArray.join("\n"))

  return {
    success: successCt,
    failure: failCt,
  }
}

export async function checkIngest() {
  if (checkIngestCount() === 0) {
    console.log("[i] No URLs to ingest")
  } else {
    console.log(`[i] Ingesting ${checkIngestCount()} URLs`)
    const { success, failure } = await ingestUrls()
    console.log(`[✓] Ingested ${success} URLs, failed to ingest ${failure} URLs`)
  }
}
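ingestUrls issues one INSERT per URL, which is what lets it count successes and failures individually and drop only the ingested lines from urls.txt. Drizzle's insert also accepts an array of rows, so a batched variant (a sketch, not what the commit does, and all-or-nothing on failure) could look like:

    await db.insert(scrapeQueue).values(
      duplicatesRemoved.map((url) => ({ url, status: "pending" }))
    )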
scraper/src/util/osFunctions.ts (Normal file, +16)
@@ -0,0 +1,16 @@
import { exec } from "child_process"

export function clearScreen() {
  const os = process.platform

  if (os === "win32") {
    exec("cls")
  } else {
    exec("clear")
  }
}

export function truncate(str: string, { length }: { length: number }): string {
  if (str.length <= length) return str;
  return str.slice(0, length) + "...";
}
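One caveat on clearScreen as written: child_process.exec collects the child's stdout into a buffer rather than attaching it to the terminal, so the escape sequences that cls/clear emit likely never reach the screen. Node's built-in console.clear() is the cross-platform equivalent; a minimal replacement sketch:

    export function clearScreen() {
      console.clear()
    }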
scraper/src/util/toScrape.ts (Normal file, +28)
@@ -0,0 +1,28 @@
import "dotenv/config"
import { drizzle } from "drizzle-orm/node-postgres"
import { scrapeQueue } from "../db/schema"
import { eq, asc } from "drizzle-orm"
import { NodePgDatabase } from "drizzle-orm/node-postgres"
import * as schema from "../db/schema"
import { Pool } from "pg"

const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
})

const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>

async function getRandomUrl() {
  const url = await db.query.scrapeQueue.findFirst({
    where: eq(scrapeQueue.status, "pending"),
    orderBy: [asc(scrapeQueue.createdAt)],
  })

  if (!url) {
    return null
  }

  return url.url
}

export default getRandomUrl
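Despite its name, getRandomUrl is deterministic: it returns the oldest pending row (FIFO on createdAt). If genuinely random selection were wanted, a sketch against the core select builder (an assumption, not part of the commit) would be:

    import { sql } from "drizzle-orm"

    const row = (await db
      .select()
      .from(scrapeQueue)
      .where(eq(scrapeQueue.status, "pending"))
      .orderBy(sql`random()`)
      .limit(1))[0]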
scraper/tsconfig.json (Normal file, +28)
@@ -0,0 +1,28 @@
{
  "compilerOptions": {
    // Environment setup & latest features
    "lib": ["ESNext"],
    "target": "ESNext",
    "module": "ESNext",
    "moduleDetection": "force",
    "jsx": "react-jsx",
    "allowJs": true,

    // Bundler mode
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "verbatimModuleSyntax": true,
    "noEmit": true,

    // Best practices
    "strict": true,
    "skipLibCheck": true,
    "noFallthroughCasesInSwitch": true,
    "noUncheckedIndexedAccess": true,

    // Some stricter flags (disabled by default)
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noPropertyAccessFromIndexSignature": false
  }
}