feat: add extraction+saving of title, description, image, and keywords to db, add robots check before scraping
This commit is contained in:
parent
47d8470372
commit
b058ca23c0
@ -20,6 +20,7 @@
|
|||||||
"cheerio": "^1.0.0",
|
"cheerio": "^1.0.0",
|
||||||
"dotenv": "^16.5.0",
|
"dotenv": "^16.5.0",
|
||||||
"drizzle-orm": "^0.42.0",
|
"drizzle-orm": "^0.42.0",
|
||||||
"pg": "^8.15.5"
|
"pg": "^8.15.5",
|
||||||
|
"robots-txt-parser": "^2.0.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { integer, pgTable, varchar, timestamp } from "drizzle-orm/pg-core"
|
import { integer, pgTable, varchar, timestamp, text } from "drizzle-orm/pg-core"
|
||||||
|
|
||||||
export const scrapeQueue = pgTable("scrape_queue", {
|
export const scrapeQueue = pgTable("scrape_queue", {
|
||||||
id: integer().primaryKey().generatedAlwaysAsIdentity(),
|
id: integer().primaryKey().generatedAlwaysAsIdentity(),
|
||||||
@ -13,6 +13,8 @@ export const searchData = pgTable("search_data", {
|
|||||||
url: varchar({ length: 255 }).notNull(),
|
url: varchar({ length: 255 }).notNull(),
|
||||||
title: varchar({ length: 255 }).notNull(),
|
title: varchar({ length: 255 }).notNull(),
|
||||||
description: varchar({ length: 255 }).notNull(),
|
description: varchar({ length: 255 }).notNull(),
|
||||||
|
imageUrl: varchar({ length: 255 }).notNull(),
|
||||||
|
keywords: text("text[]").array(),
|
||||||
createdAt: timestamp().notNull().defaultNow(),
|
createdAt: timestamp().notNull().defaultNow(),
|
||||||
updatedAt: timestamp().notNull().defaultNow(),
|
updatedAt: timestamp().notNull().defaultNow(),
|
||||||
})
|
})
|
@ -1,5 +1,7 @@
|
|||||||
import { checkIngest } from "./ingest/ingest"
|
import { checkIngest } from "./ingest/ingest"
|
||||||
|
import checkIfScrapeAllowed from "./safety/preCheck"
|
||||||
import { clearScreen, truncate } from "./util/osFunctions"
|
import { clearScreen, truncate } from "./util/osFunctions"
|
||||||
|
import { saveToDatabase, scrapeUrl } from "./util/scrape"
|
||||||
import getRandomUrl from "./util/toScrape"
|
import getRandomUrl from "./util/toScrape"
|
||||||
import * as readline from "readline"
|
import * as readline from "readline"
|
||||||
|
|
||||||
@ -16,6 +18,7 @@ function promptUser(question: string): Promise<string> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
clearScreen()
|
||||||
checkIngest()
|
checkIngest()
|
||||||
console.log()
|
console.log()
|
||||||
|
|
||||||
@ -34,13 +37,25 @@ async function main() {
|
|||||||
console.log("│ NAVIGATE SCRAPER │")
|
console.log("│ NAVIGATE SCRAPER │")
|
||||||
console.log("├───────────────────────────────────────────────┤")
|
console.log("├───────────────────────────────────────────────┤")
|
||||||
console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
|
console.log(`│ URL: ${truncate(url, { length: 40 })}... │`)
|
||||||
|
console.log("│ Pre-check: ", await checkIfScrapeAllowed(url) ? "Allowed" : "Blocked", " │")
|
||||||
console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
|
console.log("┢━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┪")
|
||||||
console.log("┃ [S]crape ┃ [Q]uit ┃")
|
console.log("┃ [S]crape ┃ [Q]uit ┃")
|
||||||
console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
|
console.log("┗━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━┛\n")
|
||||||
|
|
||||||
const input = await promptUser("> ")
|
const input = await promptUser("> ")
|
||||||
if (input === "s") {
|
if (input === "s") {
|
||||||
console.log("I would scrape now...")
|
const { title, description, imageUrl, keywordsArray } = await scrapeUrl(url)
|
||||||
|
|
||||||
|
console.log(`\nTitle: ${title}`)
|
||||||
|
console.log(`Description: ${description}`)
|
||||||
|
console.log(`Image URL: ${imageUrl}`)
|
||||||
|
console.log(`Keywords: ${keywordsArray.join(", ")}\n`)
|
||||||
|
|
||||||
|
console.log("Save to database? (y/n)")
|
||||||
|
const save = await promptUser("> ")
|
||||||
|
if (save === "y") {
|
||||||
|
await saveToDatabase(url, title, description, imageUrl, keywordsArray)
|
||||||
|
}
|
||||||
} else if (input === "q") {
|
} else if (input === "q") {
|
||||||
clearScreen()
|
clearScreen()
|
||||||
console.log("\nExiting...\n")
|
console.log("\nExiting...\n")
|
||||||
|
18
scraper/src/safety/preCheck.ts
Normal file
18
scraper/src/safety/preCheck.ts
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import robotsParser from 'robots-txt-parser';
|
||||||
|
|
||||||
|
// Shared robots.txt client used by checkIfScrapeAllowed below.
// userAgent identifies this crawler when matching robots.txt rules.
// allowOnNeutral: false — presumably fail-closed when robots.txt gives no
// clear allow/deny for a URL; TODO confirm against robots-txt-parser docs.
const robots = robotsParser(
    {
        userAgent: 'NavigateBot',
        allowOnNeutral: false,
    },
)
|
||||||
|
|
||||||
|
export default async function checkIfScrapeAllowed(url: string) {
|
||||||
|
try {
|
||||||
|
await robots.useRobotsFor(url)
|
||||||
|
return robots.canCrawl(url)
|
||||||
|
} catch (error) {
|
||||||
|
console.error("[!]", error)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
43
scraper/src/util/scrape.ts
Normal file
43
scraper/src/util/scrape.ts
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
import axios from "axios"
|
||||||
|
import "dotenv/config"
|
||||||
|
import { NodePgDatabase } from "drizzle-orm/node-postgres"
|
||||||
|
import { drizzle } from "drizzle-orm/node-postgres"
|
||||||
|
import { searchData } from "../db/schema"
|
||||||
|
import * as cheerio from "cheerio"
|
||||||
|
import * as schema from "../db/schema"
|
||||||
|
import { Pool } from "pg"
|
||||||
|
|
||||||
|
const pool = new Pool({
|
||||||
|
connectionString: process.env.DATABASE_URL,
|
||||||
|
})
|
||||||
|
|
||||||
|
const db = drizzle(pool, { schema }) as NodePgDatabase<typeof schema>
|
||||||
|
|
||||||
|
export async function scrapeUrl(url: string) {
|
||||||
|
const response = await axios.get(url)
|
||||||
|
const $ = cheerio.load(response.data)
|
||||||
|
|
||||||
|
// Data
|
||||||
|
const title = $("title").text()
|
||||||
|
const description = $("meta[name='description']").attr("content") || ""
|
||||||
|
const imageUrl = $("meta[property='og:image']").attr("content") || ""
|
||||||
|
const keywords = $("meta[name='keywords']").attr("content")
|
||||||
|
|
||||||
|
// Extract keywords
|
||||||
|
let keywordsArray: string[] = []
|
||||||
|
if (keywords) {
|
||||||
|
keywordsArray = keywords.split(",")
|
||||||
|
}
|
||||||
|
|
||||||
|
return { title, description, imageUrl, keywordsArray }
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function saveToDatabase(url: string, title: string, description: string, imageUrl: string, keywordsArray: string[]) {
|
||||||
|
await db.insert(searchData).values({
|
||||||
|
url,
|
||||||
|
title,
|
||||||
|
description,
|
||||||
|
imageUrl,
|
||||||
|
keywords: keywordsArray,
|
||||||
|
})
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user