initial release
This commit is contained in:
185
api-gateway/src/providers/downloader.provider.ts
Normal file
185
api-gateway/src/providers/downloader.provider.ts
Normal file
@@ -0,0 +1,185 @@
|
||||
import axios from 'axios';
|
||||
import { randomUUID } from 'crypto';
|
||||
import * as fs from 'fs';
|
||||
import { parse } from 'node-html-parser';
|
||||
import config from '../config/config.js';
|
||||
|
||||
export namespace DownloaderProvider {
    /** Base URL of the site the chapter pages are scraped from. */
    const bibleserver_endpoint = 'https://www.bibleserver.com';

    /** Lifecycle states reported by {@link Downloader.getStatus}. */
    export type DownloaderStatus = 'idle' | 'running' | 'completed' | 'error';

    /** One verse as it is persisted in the per-book JSON file. */
    interface VerseRecord {
        translation: string;
        book: string;
        chapter: number;
        verse: number;
        text: string;
    }

    /**
     * Downloads every book of one translation, chapter by chapter, and stores
     * each book as `<data directory>/<translation>/<book>.json`.
     *
     * The list of books is read from `<data directory>/books.json`. Progress is
     * reported through {@link Downloader.logs} and {@link Downloader.getStatus};
     * the download itself runs in the background after {@link Downloader.start}
     * returns.
     */
    export class Downloader {
        private _translation: string;
        private _logs: string[] = [];
        private _status: DownloaderStatus = 'idle';
        private _books: string[] = [];
        private _data_directory: string = 'data';
        private _operation_id: string = '';
        private _time_start: Date | undefined;
        private _time_end: Date | undefined;

        /** Translation identifier this downloader was created for. */
        public get translation() {
            return this._translation;
        }

        /** Random identifier generated for this downloader instance. */
        public get uuid() {
            return this._operation_id;
        }

        /** All log lines produced so far (the live array, not a copy). */
        public get logs() {
            return this._logs;
        }

        /** Time of the most recent {@link start} call, or `undefined` before it. */
        public get time_start() {
            return this._time_start;
        }

        /** Time the most recent run finished or failed; `undefined` while none has ended. */
        public get time_end() {
            return this._time_end;
        }

        /**
         * @param translation_in Translation identifier; it is used verbatim as a
         *   URL path segment and as a directory name below the data directory.
         */
        constructor(translation_in: string) {
            // Deliberately not a parameter property: the value lives in `_translation`
            // only, so no second, unused private field is created.
            // NOTE(review): the value is not validated here; if it can come from a
            // request, path separators in it should be rejected by the caller.
            this._translation = translation_in;
            this._operation_id = randomUUID();
            this._status = 'idle';
            this.log(`Initialized downloader for translation: ${this._translation}`);
            this.log(`Using bibleserver endpoint: ${bibleserver_endpoint}`);
        }

        /**
         * Loads the book list, makes sure the translation directory exists and
         * kicks off the download in the background.
         *
         * Failures while preparing are logged and reflected in the status
         * (`'error'`); nothing is thrown to the caller.
         */
        public start() {
            this._time_start = new Date();
            this._time_end = undefined; // forget the end time of a previous run
            this.log(`Starting download for translation: ${this._translation}`);
            this._status = 'running';

            // Load the list of books from the reference file.
            try {
                const book_list_file = `${this._data_directory}/books.json`;
                this.log(`Loading book list from ${book_list_file}`);
                const parsed: unknown = JSON.parse(fs.readFileSync(book_list_file, 'utf-8'));
                const books = (parsed as { books?: unknown } | null | undefined)?.books;
                // Validate the shape: a plain string would otherwise be iterated
                // character by character as if each character were a book name.
                if (!Array.isArray(books) || !books.every((entry): entry is string => typeof entry === 'string')) {
                    throw new Error(`${book_list_file} must contain a "books" array of strings`);
                }
                this._books = books;
                this.log(`Loaded ${this._books.length} books to download`);
            } catch (error) {
                this.log(`Error loading book list: ${error}`);
                this.mark_failed();
                return;
            }

            // Create the output directory for this translation.
            try {
                const translation_directory = `${this._data_directory}/${this._translation}`;
                if (!fs.existsSync(translation_directory)) {
                    fs.mkdirSync(translation_directory, { recursive: true });
                    this.log(`Created directory: ${translation_directory}`);
                }
            } catch (error) {
                this.log(`Error creating translation directory: ${error}`);
                this.mark_failed();
                return;
            }

            // Fire-and-forget: fetch_all_books() handles its own errors and
            // publishes the outcome through the status and the logs.
            void this.fetch_all_books();
        }

        /** Current lifecycle state of this downloader. */
        public getStatus(): DownloaderStatus {
            return this._status;
        }

        /** Resolves after `ms` milliseconds. */
        private delay(ms: number): Promise<void> {
            return new Promise(resolve => setTimeout(resolve, ms));
        }

        /** Records a failed run: sets the status to `'error'` and stamps the end time. */
        private mark_failed(): void {
            this._status = 'error';
            this._time_end = new Date();
        }

        /**
         * Downloads every book from the loaded list that has no JSON file yet.
         *
         * Chapters are requested one after another, starting at 1, until a
         * chapter comes back empty. Any error aborts the whole run and sets the
         * status to `'error'`.
         */
        private async fetch_all_books(): Promise<void> {
            try {
                for (const book of this._books) {
                    const book_file = `${this._data_directory}/${this._translation}/${book}.json`;
                    if (fs.existsSync(book_file)) {
                        this.log(`Book ${book} already exists for ${this._translation}, skipping`);
                        continue;
                    }
                    this.log(`Fetching book: ${book} for translation: ${this._translation}`);

                    const book_content: VerseRecord[] = [];
                    for (let chapter = 1; ; chapter++) {
                        this.log(`Trying to fetch chapter ${chapter} of book ${book}`);
                        // Pause before every request so the remote server is not flooded.
                        await this.delay(config.downloadDelay * 1000);
                        const chapter_content = await this.fetch_chapter(book, chapter);
                        if (chapter_content.length === 0) {
                            break; // first empty chapter marks the end of the book
                        }
                        book_content.push(...chapter_content);
                    }

                    // NOTE(review): a book that yields no verses is still written (as an
                    // empty array) and will therefore be skipped on later runs — confirm
                    // that readers of these files rely on this before changing it.
                    fs.writeFileSync(book_file, JSON.stringify(book_content, null, 4));
                    this.log(`Saved book ${book} for translation ${this._translation} with ${book_content.length} verses`);
                    this.log(`Completed fetching book: ${book} for translation: ${this._translation}`);
                }
                this._time_end = new Date();
                this._status = 'completed';
            } catch (error) {
                this.log(`Error fetching books: ${error}`);
                this.mark_failed();
            }
        }

        /**
         * Fetches one chapter page and extracts its verses.
         *
         * @param book Book name as used in the URL and in the page heading.
         * @param chapter Chapter number, starting at 1.
         * @returns The verses of the chapter, or an empty array when the page
         *   heading does not match `<book> <chapter>` or the server answers 404.
         * @throws Re-throws any request or parsing error other than an HTTP 404.
         */
        private async fetch_chapter(book: string, chapter: number): Promise<VerseRecord[]> {
            const bibleserver_url = `${bibleserver_endpoint}/${this._translation}/${book}${chapter}`;
            this.log(`Fetching URL from: ${bibleserver_url}`);
            try {
                const response = await axios.get(bibleserver_url);
                this.log("Received response");
                const root = parse(response.data);

                // The heading must name exactly the requested chapter; anything else
                // means we were redirected and the chapter does not exist.
                const heading = root.querySelector('.chapter')?.querySelector('header')?.querySelector('h1')?.text.trim() ?? '';
                if (heading !== `${book} ${chapter}`) {
                    return [];
                }

                const verses: VerseRecord[] = [];
                for (const verse_element of root.querySelectorAll('.verse')) {
                    // Footnotes must be removed before the verse content is read.
                    verse_element.querySelectorAll('.footnote').forEach(fn => fn.remove());

                    // `childNodes[0]` is undefined for an element without children;
                    // guard it so one odd verse cannot abort the whole download.
                    const verse_raw = verse_element.querySelector('.verse-number')?.childNodes[0]?.text;
                    const text = verse_element.querySelector('.verse-content')?.childNodes[0]?.text || '';

                    if (verse_raw?.includes('-')) {
                        // A range such as "3-5" produces one record per verse number,
                        // all sharing the same text.
                        const bounds = verse_raw.split('-');
                        const first = parseInt(bounds[0] ?? '', 10);
                        const last = parseInt(bounds[1] ?? '', 10);
                        for (let verse = first; verse <= last; verse++) {
                            verses.push({ translation: this._translation, book, chapter, verse, text });
                        }
                    } else {
                        verses.push({
                            translation: this._translation,
                            book,
                            chapter,
                            verse: Number(verse_raw) || -1, // -1 marks a missing or unparsable number
                            text,
                        });
                    }
                }
                return verses;
            } catch (error: unknown) {
                // Read the HTTP status without assuming anything about the thrown value.
                const http_status = (error as { response?: { status?: unknown } } | null | undefined)?.response?.status;
                if (http_status === 404) {
                    // translation does not exist
                    this.log(`Translation ${this._translation} does not exist for book ${book}`);
                    return [];
                }
                this.log(`Error fetching ${this._translation} ${book} ${chapter}- ${error}`);
                this.log(`URL: ${bibleserver_url}`);
                throw error;
            }
        }

        /** Appends a timestamped, instance-tagged line to the log buffer and prints it to the console. */
        private log(message: string) {
            const log_entry = `[Downloader<${this._operation_id}>:${this._translation}][${new Date().toISOString()}] ${message}`;
            this._logs.push(log_entry);
            console.log(log_entry);
        }
    }
}
|
||||
Reference in New Issue
Block a user