feat: add extraction+saving of title, description, image, and keywords to db, add robots check before scraping
parent 47d8470372
commit b058ca23c0
@@ -20,6 +20,7 @@
     "cheerio": "^1.0.0",
     "dotenv": "^16.5.0",
     "drizzle-orm": "^0.42.0",
-    "pg": "^8.15.5"
+    "pg": "^8.15.5",
+    "robots-txt-parser": "^2.0.3"
   }
 }

@@ -1,4 +1,4 @@
-import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
+import { integer, pgTable, varchar, timestamp, text } from "drizzle-orm/pg-core"
 
 export const scrapeQueue = pgTable("scrape_queue", {
   id: integer().primaryKey().generatedAlwaysAsIdentity(),
@@ -13,6 +13,8 @@ export const searchData = pgTable("search_data", {
   url: varchar({ length: 255 }).notNull(),
   title: varchar({ length: 255 }).notNull(),
   description: varchar({ length: 255 }).notNull(),
+  imageUrl: varchar({ length: 255 }).notNull(),
+  keywords: text("text[]").array(),
   createdAt: timestamp().notNull().defaultNow(),
   updatedAt: timestamp().notNull().defaultNow(),
 })

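For context, a minimal read-back sketch (not part of this commit) showing how the new columns come out through drizzle. `listScraped` is a hypothetical helper; it mirrors the Pool/drizzle setup from scrape.ts below and assumes it sits next to schema.ts:

```ts
// Hypothetical read-back helper (not part of this commit).
import "dotenv/config"
import { drizzle } from "drizzle-orm/node-postgres"
import { Pool } from "pg"
import { searchData } from "./schema"

const pool = new Pool({ connectionString: process.env.DATABASE_URL })
const db = drizzle(pool)

export async function listScraped() {
  const rows = await db.select().from(searchData)
  for (const row of rows) {
    // keywords is declared with .array(), so it comes back as string[] | null
    console.log(row.title, row.keywords ?? [])
  }
}
```
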
@@ -1,5 +1,7 @@
 import { checkIngest } from "./ingest/ingest"
+import checkIfScrapeAllowed from "./safety/preCheck"
 import { clearScreen, truncate } from "./util/osFunctions"
+import { saveToDatabase, scrapeUrl } from "./util/scrape"
 import getRandomUrl from "./util/toScrape"
 import * as readline from "readline"
 
@@ -16,6 +18,7 @@ function promptUser(question: string): Promise<string> {
   })
 }
 
 clearScreen()
 checkIngest()
+console.log()
 
@@ -34,13 +37,25 @@ async function main() {
   console.log("│ NAVIGATE SCRAPER │")
   console.log("├───────────────────────────────────────────────┤")
   console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
+  console.log("│ Pre-check: ", await checkIfScrapeAllowed(url) ? "Allowed" : "Blocked", " │")
   console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
   console.log("┃ [S]crape ┃ [Q]uit ┃")
   console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
 
   const input = await promptUser("> ")
   if (input === "s") {
-    console.log("I would scrape now...")
+    const { title, description, imageUrl, keywordsArray } = await scrapeUrl(url)
+
+    console.log(`\nTitle: ${title}`)
+    console.log(`Description: ${description}`)
+    console.log(`Image URL: ${imageUrl}`)
+    console.log(`Keywords: ${keywordsArray.join(", ")}\n`)
+
+    console.log("Save to database? (y/n)")
+    const save = await promptUser("> ")
+    if (save === "y") {
+      await saveToDatabase(url, title, description, imageUrl, keywordsArray)
+    }
   } else if (input === "q") {
     clearScreen()
     console.log("\nExiting...\n")

scraper/src/safety/preCheck.ts (new file, 18 lines)
@@ -0,0 +1,18 @@
+import robotsParser from 'robots-txt-parser';
+
+const robots = robotsParser(
+  {
+    userAgent: 'NavigateBot',
+    allowOnNeutral: false,
+  },
+)
+
+export default async function checkIfScrapeAllowed(url: string) {
+  try {
+    await robots.useRobotsFor(url)
+    return robots.canCrawl(url)
+  } catch (error) {
+    console.error("[!]", error)
+    return false
+  }
+}

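As an illustration (not part of the commit), the new pre-check can also be called on its own; the URL below is a placeholder:

```ts
// Hypothetical standalone use of the pre-check (not part of this commit).
import checkIfScrapeAllowed from "./safety/preCheck"

async function demo() {
  const url = "https://example.com/" // placeholder URL
  if (await checkIfScrapeAllowed(url)) {
    console.log("robots.txt allows NavigateBot to crawl", url)
  } else {
    // Also the fallback when robots.txt can't be fetched, since
    // checkIfScrapeAllowed returns false on errors.
    console.log("robots.txt blocks NavigateBot from crawling", url)
  }
}

demo()
```
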
scraper/src/util/scrape.ts (new file, 43 lines)
@@ -0,0 +1,43 @@
+import axios from "axios"
+import "dotenv/config"
+import { NodePgDatabase } from "drizzle-orm/node-postgres"
+import { drizzle } from "drizzle-orm/node-postgres"
+import { searchData } from "../db/schema"
+import * as cheerio from "cheerio"
+import * as schema from "../db/schema"
+import { Pool } from "pg"
+
+const pool = new Pool({
+  connectionString: process.env.DATABASE_URL,
+})
+
+const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>
+
+export async function scrapeUrl(url: string) {
+  const response = await axios.get(url)
+  const $ = cheerio.load(response.data)
+
+  // Data
+  const title = $("title").text()
+  const description = $("meta[name='description']").attr("content") || ""
+  const imageUrl = $("meta[property='og:image']").attr("content") || ""
+  const keywords = $("meta[name='keywords']").attr("content")
+
+  // Extract keywords
+  let keywordsArray: string[] = []
+  if (keywords) {
+    keywordsArray = keywords.split(",")
+  }
+
+  return { title, description, imageUrl, keywordsArray }
+}
+
+export async function saveToDatabase(url: string, title: string, description: string, imageUrl: string, keywordsArray: string[]) {
+  await db.insert(searchData).values({
+    url,
+    title,
+    description,
+    imageUrl,
+    keywords: keywordsArray,
+  })
+}

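Taken together, a minimal sketch (not part of the commit) of the non-interactive flow these pieces enable: pre-check, scrape, then save. `scrapeIfAllowed` is a hypothetical helper, assumed to live directly under scraper/src:

```ts
// Hypothetical glue function (not part of this commit): robots pre-check,
// then extraction, then persistence.
import checkIfScrapeAllowed from "./safety/preCheck"
import { saveToDatabase, scrapeUrl } from "./util/scrape"

export async function scrapeIfAllowed(url: string): Promise<boolean> {
  // Respect robots.txt before fetching the page itself
  if (!(await checkIfScrapeAllowed(url))) return false

  const { title, description, imageUrl, keywordsArray } = await scrapeUrl(url)
  await saveToDatabase(url, title, description, imageUrl, keywordsArray)
  return true
}
```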