initial commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
*~
|
||||
html-src/
|
||||
ebooks/
|
||||
*.csv
|
||||
42
README.md
Normal file
42
README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
Project Gutenberg Mirror
|
||||
========================
|
||||
|
||||
This project aims to create a simple, searchable, updatable mirror of
|
||||
[Project Gutenberg](https://www.gutenberg.org)'s ebook collection. It
|
||||
downloads the HTML source for all books in the collection, converts them to
|
||||
ePub and Kindle formats, and provides a simple web interface to search for
|
||||
books and download them.
|
||||
|
||||
update-mirror.sh
|
||||
----------------
|
||||
|
||||
This script downloads books (in HTML, with images) to the html-src
|
||||
directory. As of 13 September 2024, the collection includes 61878 titles
|
||||
and needs 133.2 GB of disk space.
|
||||
|
||||
It also downloads pg_catalog.csv, which provides the input to the next
|
||||
stage:
|
||||
|
||||
db_update.py
|
||||
------------
|
||||
|
||||
This script processes pg_catalog.csv, sorting its contents into a MariaDB
|
||||
database to facilitate searching on authors, subjects, titles, etc.
|
||||
Database configuration is passed to it through environment variables.
|
||||
|
||||
gutenberg_mirror.sql
|
||||
--------------------
|
||||
|
||||
This is the database schema, which will need to be loaded into MariaDB
|
||||
before running db_update.py.
|
||||
|
||||
html_to_epub.sh
|
||||
---------------
|
||||
|
||||
This script converts books from HTML to ePub and Kindle formats. It relies
|
||||
on a [containerized version](https://gitlab.alfter.us/salfter/ebookmaker) of
|
||||
Project Gutenberg's Ebookmaker tool. The server I'm using for this mirror
|
||||
is basically a Docker host with a bunch of containers on it. At some point,
|
||||
everything here will most likely get bundled up into one container for ease
|
||||
of installation.
|
||||
|
||||
76
db_update.py
Normal file
76
db_update.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import csv
|
||||
import mariadb
|
||||
import signal
|
||||
import sys
|
||||
from os import environ
|
||||
|
||||
# fields in pg_catalog.csv:
|
||||
# Text# Type Issued Title Language Authors Subjects LoCC Bookshelves
|
||||
|
||||
# catalog rows read from pg_catalog.csv, keyed by the integer "Text#" value
bookdata={}
|
||||
|
||||
# close out on Ctrl-C

def signal_handler(sig, frame):
    """Close the database connection, if one is open, and exit with status 0.

    Installed as the SIGINT handler.

    Args:
        sig: signal number passed in by the signal module (unused).
        frame: interrupted stack frame passed in by the signal module (unused).

    Raises:
        SystemExit: always, with exit code 0.
    """
    # The handler is registered before the connection is created, so an
    # interrupt while the catalog is still being read must not fail on an
    # undefined global.
    connection = globals().get("conn")
    if connection is not None:
        connection.close()
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
# read CSV

# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed intact; the encoding is pinned to UTF-8 so the result
# does not depend on the locale the script happens to run under.
with open("pg_catalog.csv", newline="", encoding="utf-8") as f:
    # key each catalog row by its integer book number ("Text#" column);
    # a later row with the same number replaces an earlier one
    bookdata = {int(row["Text#"]): row for row in csv.DictReader(f)}
|
||||
|
||||
# connect to database

# Connection settings are taken from environment variables of the same name;
# a variable that is not set is passed through as None.
settings = {key: environ.get(key) for key in ("user", "password", "host", "database")}

try:
    conn = mariadb.connect(**settings)
except mariadb.Error as err:
    # nothing can be imported without a connection: report and stop
    print(f"database connection error: {err}")
    sys.exit(1)

cur = conn.cursor()
|
||||
|
||||
# cycle through the rows
#
# For every catalog entry: insert the book itself, then one row per value of
# each "; "-separated multi-value column, and commit once per book.

# (insert statement, catalog column) for each multi-value child table
child_inserts = (
    ("insert into subjects select ?, ?", "Subjects"),
    ("insert into authors select ?, ?", "Authors"),
    ("insert into languages select ?, ?", "Language"),
    ("insert into shelves select ?, ?", "Bookshelves"),
)

# MariaDB error number for a duplicate key (ER_DUP_ENTRY)
duplicate_key_errno = 1062

for book_id, record in bookdata.items():
    print(book_id)

    try:
        cur.execute("insert into books select ?, ?, ?", (book_id, record["Title"], record["LoCC"]))
    except mariadb.Error as e:
        # A duplicate key means we already have this book: skip it quietly.
        # Any other failure is reported rather than silently dropped.
        if getattr(e, "errno", None) != duplicate_key_errno:
            print(f"database error: {e}")
        continue

    for statement, column in child_inserts:
        for value in record[column].split("; "):
            if not value:
                # "".split("; ") yields [""]; an empty column has no values
                # and must not produce an empty-string row
                continue
            try:
                cur.execute(statement, (book_id, value))
            except mariadb.Error as e:
                print(f"database error: {e}")

    try:
        conn.commit()
    except mariadb.Error as e:
        print(f"database error: {e}")

conn.close()
|
||||
|
||||
95
gutenberg_mirror.sql
Normal file
95
gutenberg_mirror.sql
Normal file
@@ -0,0 +1,95 @@
|
||||
/*M!999999\- enable the sandbox mode */
|
||||
-- MariaDB dump 10.19-11.5.2-MariaDB, for Linux (x86_64)
|
||||
--
|
||||
-- Host: server Database: gutenberg_mirror
|
||||
-- ------------------------------------------------------
|
||||
-- Server version 11.4.3-MariaDB-ubu2404
|
||||
|
||||
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
|
||||
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
|
||||
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
|
||||
/*!40101 SET NAMES utf8mb4 */;
|
||||
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
|
||||
/*!40103 SET TIME_ZONE='+00:00' */;
|
||||
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
|
||||
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
|
||||
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
|
||||
/*M!100616 SET @OLD_NOTE_VERBOSITY=@@NOTE_VERBOSITY, NOTE_VERBOSITY=0 */;
|
||||
|
||||
--
|
||||
-- Table structure for table `authors`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `authors`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
-- One row per (book, author) pair, filled by db_update.py from the catalog's
-- "Authors" column.  book_id holds the same number as books.book_id and is
-- indexed so a book's authors can be looked up without a full table scan.
CREATE TABLE `authors` (
  `book_id` int(11) NOT NULL,
  `author` varchar(256) NOT NULL,
  KEY `book_id` (`book_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `books`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `books`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
-- One row per catalog entry, keyed by the catalog's book number.
CREATE TABLE `books` (
  `book_id` INT(11)      NOT NULL,
  `title`   VARCHAR(256) NOT NULL,
  `locc`    VARCHAR(16)  DEFAULT NULL,
  PRIMARY KEY (`book_id`)
)
  ENGINE=InnoDB
  DEFAULT CHARSET=utf8mb4
  COLLATE=utf8mb4_uca1400_ai_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `languages`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `languages`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
-- One row per (book, language) pair, filled by db_update.py from the
-- catalog's "Language" column.  book_id holds the same number as
-- books.book_id and is indexed so lookups by book avoid a full table scan.
CREATE TABLE `languages` (
  `book_id` int(11) NOT NULL,
  `language` varchar(8) NOT NULL,
  KEY `book_id` (`book_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `shelves`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `shelves`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
-- One row per (book, bookshelf) pair, filled by db_update.py from the
-- catalog's "Bookshelves" column.  book_id holds the same number as
-- books.book_id and is indexed so lookups by book avoid a full table scan.
CREATE TABLE `shelves` (
  `book_id` int(11) NOT NULL,
  `shelf` varchar(256) NOT NULL,
  KEY `book_id` (`book_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `subjects`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `subjects`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
-- One row per (book, subject) pair, filled by db_update.py from the
-- catalog's "Subjects" column.  book_id holds the same number as
-- books.book_id and is indexed so lookups by book avoid a full table scan.
CREATE TABLE `subjects` (
  `book_id` int(11) NOT NULL,
  `subject` varchar(256) NOT NULL,
  KEY `book_id` (`book_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
||||
|
||||
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
||||
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
|
||||
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
|
||||
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
|
||||
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
|
||||
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||
/*M!100616 SET NOTE_VERBOSITY=@OLD_NOTE_VERBOSITY */;
|
||||
|
||||
-- Dump completed on 2024-09-13 7:54:38
|
||||
18
html_to_epub.sh
Normal file
18
html_to_epub.sh
Normal file
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash

# Convert every *.htm book under html-src into ePub and Kindle (mobi) files
# under ebooks.  A book is rebuilt only when its HTML source is newer than the
# existing ePub (or no ePub exists yet).

image=cr.gitlab.alfter.us:443/salfter/ebookmaker

# The file list is NUL-delimited and read on fd 3: names are never word-split
# or glob-expanded, and "docker run -i" keeps the script's own stdin instead
# of swallowing the list.
while IFS= read -r -d '' srcfile <&3
do
  prefix=$(basename -s -h.htm -- "$srcfile")
  printf '%s\n' "$prefix"

  # output directory: two levels above the source file, with the leading
  # html-src replaced by ebooks
  destpath=$(dirname -- "$(dirname -- "$srcfile")")
  destpath=${destpath/#html-src/ebooks}
  mkdir -p -- "$destpath"

  if [ "$srcfile" -nt "$destpath/$prefix.epub" ]
  then
    # Only stdout is discarded.  The original "2>&1 >/dev/null" never
    # silenced stderr either, so errors stay visible as before.

    # first pass: build with --generate_cover
    docker run -it --rm -v "$(pwd):/data" "$image" --make kindle.images --generate_cover --output-dir "/data/$destpath" "/data/$srcfile" >/dev/null

    # pick up the first PNG left in the output directory (instead of parsing ls)
    coverfile=""
    for candidate in "$destpath"/*.png
    do
      if [ -e "$candidate" ]
      then
        coverfile=$candidate
        break
      fi
    done

    # second pass: rebuild, passing that PNG back in via --cover
    docker run -it --rm -v "$(pwd):/data" "$image" --make kindle.images --cover "/data/$coverfile" --output-dir "/data/$destpath" "/data/$srcfile" >/dev/null

    # give the results predictable names and drop the temporary cover image
    mv -- "$destpath"/*.epub "$destpath/$prefix.epub"
    mv -- "$destpath"/*.mobi "$destpath/$prefix.mobi"
    if [ -n "$coverfile" ]
    then
      rm -f -- "$coverfile"
    fi
  fi
done 3< <(find html-src -name '*.htm' -print0)
|
||||
5
update-mirror.sh
Normal file
5
update-mirror.sh
Normal file
@@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash

# Fetch the current catalog into the working directory.
rsync -avz aleph.gutenberg.org::gutenberg-epub/feeds/pg_catalog.csv .

# Filter rules for the book mirror.  rsync acts on the first rule that matches,
# so the order below is significant: prune unwanted trees, descend into every
# remaining directory, keep HTML and image files, drop everything else.
mirror_opts=(
  -avz
  --delete
  --delete-excluded
  --exclude "/images/"
  --exclude "/pg/"
  --exclude "/retired/"
  --exclude "/cache/"
  --exclude "*/*-page-images/*"
  --exclude "*/*-page-images/"
  --exclude "*/old/*"
  --exclude "*/old/"
  --include "*/"
  --include "*.htm"
  --include "*.jpg"
  --include "*.png"
  --include "*.gif"
  --exclude "*"
)
rsync "${mirror_opts[@]}" aleph.gutenberg.org::gutenberg html-src

# This image is matched by the *.png include rule but is not wanted locally.
rm html-src/hosted-by-ibiblio.png

#rsync -avz --delete --delete-excluded --exclude "/images/" --exclude "/pg/" --exclude "/retired/" --exclude "/cache/" --exclude "*/*-h/*" --exclude "*/*-h/" --exclude "*/old/*" --exclude "*/old/" --include "*/" --include "*-8.txt" --exclude "*" aleph.gutenberg.org::gutenberg txt-src
|
||||
Reference in New Issue
Block a user