initial commit

This commit is contained in:
2024-09-13 15:12:30 -07:00
commit ab9f71ca86
6 changed files with 240 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
*~
html-src/
ebooks/
*.csv

42
README.md Normal file
View File

@@ -0,0 +1,42 @@
Project Gutenberg Mirror
========================
This project aims to create a simple, searchable, updatable mirror of
[Project Gutenberg](https://www.gutenberg.org)'s ebook collection. It
downloads the HTML source for all books in the collection, converts them to
ePub and Kindle formats, and provides a simple web interface to search for
books and download them.
update-mirror.sh
----------------
This script downloads books (in HTML, with images) to the html-src
directory. As of 13 September 2024, the collection includes 61878 titles
and needs 133.2 GB of disk space.
It also downloads pg_catalog.csv, which provides the input to the next
stage:
db_update.py
------------
This script processes pg_catalog.csv, loading its contents into a MariaDB
database to facilitate searching by author, subject, title, etc.
Database connection settings are passed to it through the environment
variables `user`, `password`, `host`, and `database`.
gutenberg_mirror.sql
--------------------
This is the database schema, which will need to be loaded into MariaDB
before running db_update.py.
html_to_epub.sh
---------------
This script converts books from HTML to ePub and Kindle formats. It relies
on a [containerized version](https://gitlab.alfter.us/salfter/ebookmaker) of
Project Gutenberg's Ebookmaker tool. The server I'm using for this mirror
is basically a Docker host with a bunch of containers on it. At some point,
everything here will most likely get bundled up into one container for ease
of installation.

76
db_update.py Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python
import csv
import mariadb
import signal
import sys
from os import environ
# Columns in pg_catalog.csv:
#   Text#  Type  Issued  Title  Language  Authors  Subjects  LoCC  Bookshelves

# Catalog rows keyed by integer book number (the "Text#" column).
bookdata = {}

# Database connection. Stays None until connect() below succeeds so that the
# SIGINT handler can tell whether there is anything to close.
conn = None

# (SQL statement, catalog column) pairs for the one-to-many tables. Each
# column holds zero or more values separated by "; ".
CHILD_INSERTS = (
    ("insert into subjects select ?, ?", "Subjects"),
    ("insert into authors select ?, ?", "Authors"),
    ("insert into languages select ?, ?", "Language"),
    ("insert into shelves select ?, ?", "Bookshelves"),
)


def signal_handler(sig, frame):
    """Close the database connection (if one is open) and exit on Ctrl-C."""
    if conn is not None:
        conn.close()
    sys.exit(0)


def split_field(value):
    """Return the non-blank entries of a "; "-separated catalog field.

    An empty or missing field yields an empty list, so no blank rows are
    inserted for books that have no subjects, bookshelves, etc.
    """
    return [item for item in (value or "").split("; ") if item.strip()]


signal.signal(signal.SIGINT, signal_handler)

# read CSV
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly. NOTE(review): the catalog is presumed to
# be UTF-8; confirm if it is ever fetched from a different source.
with open("pg_catalog.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        bookdata[int(row["Text#"])] = row

# connect to database
try:
    conn = mariadb.connect(
        user=environ.get("user"),
        password=environ.get("password"),
        host=environ.get("host"),
        database=environ.get("database"),
    )
except mariadb.Error as e:
    print(f"database connection error: {e}")
    sys.exit(1)
cur = conn.cursor()

# cycle through the rows
for book_id, record in bookdata.items():
    print(book_id)
    try:
        cur.execute(
            "insert into books select ?, ?, ?",
            (book_id, record["Title"], record["LoCC"]),
        )
    except mariadb.IntegrityError:
        # skip if we already have it (books.book_id is the primary key)
        conn.rollback()
        continue
    except mariadb.Error as e:
        # Anything else (e.g. a value too long for its column) is a real
        # problem with this book: report it instead of dropping it silently.
        print(f"database error: {e}")
        conn.rollback()
        continue

    # The book row is new, so add its dependent rows. A failure on one value
    # is reported and does not stop the remaining inserts.
    for sql, column in CHILD_INSERTS:
        for value in split_field(record[column]):
            try:
                cur.execute(sql, (book_id, value))
            except mariadb.Error as e:
                print(f"database error: {e}")

    try:
        conn.commit()
    except mariadb.Error as e:
        print(f"database error: {e}")
        conn.rollback()

conn.close()

95
gutenberg_mirror.sql Normal file
View File

@@ -0,0 +1,95 @@
/*M!999999\- enable the sandbox mode */
-- MariaDB dump 10.19-11.5.2-MariaDB, for Linux (x86_64)
--
-- Host: server Database: gutenberg_mirror
-- ------------------------------------------------------
-- Server version 11.4.3-MariaDB-ubu2404
--
-- Schema-only dump for the gutenberg_mirror database: it contains DROP/CREATE
-- statements for the tables authors, books, languages, shelves and subjects,
-- and no row data. Loading it drops any existing copies of those tables.
--
-- The statements below save the current session settings into @OLD_*
-- variables (they are restored at the end of this file), switch the
-- connection to utf8mb4 and the +00:00 time zone, and disable unique and
-- foreign-key checks while the tables are created.
-- NOTE(review): the /*!NNNNN ... */ and /*M!NNNNNN ... */ wrappers are
-- version-conditional comments emitted by the dump tool; see the server's
-- comment-syntax documentation before editing them by hand.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*M!100616 SET @OLD_NOTE_VERBOSITY=@@NOTE_VERBOSITY, NOTE_VERBOSITY=0 */;
--
-- Table structure for table `authors`
--
-- One row per (book, author) pair. db_update.py inserts a row for each
-- "; "-separated entry of the catalog's Authors column.
-- No primary key, index or foreign key is declared, so nothing in the schema
-- prevents duplicate rows or ties book_id to `books`.
-- NOTE(review): indexes on book_id and author would probably help the
-- searches this database is meant for; confirm against real queries.
DROP TABLE IF EXISTS `authors`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `authors` (
`book_id` int(11) NOT NULL,
`author` varchar(256) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `books`
--
-- One row per book, keyed by book_id. db_update.py fills book_id from the
-- catalog's Text# column, title from Title and locc from LoCC.
-- The primary key is what lets db_update.py skip books it has already
-- loaded: a repeated insert fails and the script moves on.
-- title is limited to 256 characters and may not be NULL; locc is limited to
-- 16 characters and may be NULL.
DROP TABLE IF EXISTS `books`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `books` (
`book_id` int(11) NOT NULL,
`title` varchar(256) NOT NULL,
`locc` varchar(16) DEFAULT NULL,
PRIMARY KEY (`book_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `languages`
--
-- One row per (book, language) pair. db_update.py inserts a row for each
-- "; "-separated entry of the catalog's Language column.
-- language is limited to 8 characters.
-- No primary key, index or foreign key is declared on this table.
DROP TABLE IF EXISTS `languages`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `languages` (
`book_id` int(11) NOT NULL,
`language` varchar(8) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `shelves`
--
-- One row per (book, bookshelf) pair. db_update.py inserts a row for each
-- "; "-separated entry of the catalog's Bookshelves column.
-- shelf is limited to 256 characters.
-- No primary key, index or foreign key is declared on this table.
DROP TABLE IF EXISTS `shelves`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `shelves` (
`book_id` int(11) NOT NULL,
`shelf` varchar(256) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `subjects`
--
-- One row per (book, subject) pair. db_update.py inserts a row for each
-- "; "-separated entry of the catalog's Subjects column.
-- subject is limited to 256 characters.
-- No primary key, index or foreign key is declared on this table.
DROP TABLE IF EXISTS `subjects`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `subjects` (
`book_id` int(11) NOT NULL,
`subject` varchar(256) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
-- Restore the session settings that were saved into the @OLD_* variables at
-- the top of this file (time zone, SQL mode, foreign-key and unique checks,
-- character set, collation and note verbosity).
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*M!100616 SET NOTE_VERBOSITY=@OLD_NOTE_VERBOSITY */;
-- Dump completed on 2024-09-13 7:54:38

18
html_to_epub.sh Normal file
View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
#
# Convert every mirrored HTML book under html-src/ into ePub and Kindle files
# under ebooks/, using the containerized Ebookmaker image.
#
# Usage: run from the directory that contains html-src/. The current
# directory is bind-mounted into the container as /data, so all paths handed
# to Ebookmaker are relative to it.

readonly EBOOKMAKER_IMAGE="cr.gitlab.alfter.us:443/salfter/ebookmaker"

# Print a diagnostic on stderr.
warn() {
  printf 'html_to_epub: %s\n' "$*" >&2
}

# Print the book prefix for a source file: its file name with a trailing
# "-h.htm" removed.
#   $1 - path to the .htm source file
book_prefix_for() {
  basename -s -h.htm -- "$1"
}

# Print the output directory for a source file: the grandparent directory of
# the .htm file with a leading "html-src" replaced by "ebooks".
#   $1 - path to the .htm source file
dest_dir_for() {
  local book_dir
  book_dir=$(dirname -- "$(dirname -- "$1")") || return
  if [[ "$book_dir" == html-src* ]]; then
    printf '%s\n' "ebooks${book_dir#html-src}"
  else
    printf '%s\n' "$book_dir"
  fi
}

# Run Ebookmaker in a throwaway container; all arguments go to Ebookmaker.
# Its output is discarded, so callers must check the exit status.
run_ebookmaker() {
  local -a tty_flags=()
  # Only ask for an interactive TTY when one exists: "docker run -t" refuses
  # to start when stdin is not a terminal (cron, pipes, CI).
  if [[ -t 0 ]]; then
    tty_flags=(-it)
  fi
  docker run "${tty_flags[@]}" --rm -v "$(pwd):/data" "$EBOOKMAKER_IMAGE" \
    "$@" >/dev/null 2>&1
}

# Rename the generated *.EXT file in a directory to the final target name.
# A file already sitting at the target (left by an earlier run) is skipped
# and then overwritten, so rebuilding a book does not trip over it.
#   $1 - directory   $2 - extension without the dot   $3 - target path
install_output() {
  local dir=$1 ext=$2 target=$3 candidate
  for candidate in "$dir"/*."$ext"; do
    [[ -e "$candidate" ]] || continue
    if [[ "$candidate" != "$target" ]]; then
      mv -f -- "$candidate" "$target"
    fi
  done
}

# Convert one book if its ePub is missing or older than the HTML source.
#   $1 - path to the .htm source file
convert_book() {
  local srcfile=$1 prefix destpath
  local -a covers=()

  prefix=$(book_prefix_for "$srcfile") || return
  printf '%s\n' "$prefix"
  destpath=$(dest_dir_for "$srcfile") || return
  mkdir -p -- "$destpath" || return

  # -nt is also true when the ePub does not exist yet.
  [[ "$srcfile" -nt "$destpath/$prefix.epub" ]] || return 0

  # First pass: have Ebookmaker generate a cover image next to the output.
  if ! run_ebookmaker --make kindle.images --generate_cover \
    --output-dir "/data/$destpath" "/data/$srcfile"; then
    warn "ebookmaker failed (cover pass) for $srcfile"
  fi

  # Second pass: rebuild using the generated cover, when there is one.
  covers=("$destpath"/*.png)
  if [[ -e "${covers[0]}" ]]; then
    if ! run_ebookmaker --make kindle.images --cover "/data/${covers[0]}" \
      --output-dir "/data/$destpath" "/data/$srcfile"; then
      warn "ebookmaker failed (final pass) for $srcfile"
    fi
    rm -f -- "${covers[@]}"
  else
    warn "no cover image was generated for $srcfile"
  fi

  install_output "$destpath" epub "$destpath/$prefix.epub"
  install_output "$destpath" mobi "$destpath/$prefix.mobi"
}

# Walk html-src and convert every .htm file found.
main() {
  local srcfile
  if [[ ! -d html-src ]]; then
    warn "html-src not found; run this script from the mirror directory"
    return 1
  fi
  # NUL-delimited paths keep names with whitespace intact. The list is read
  # on fd 3 so that docker cannot consume it through stdin.
  while IFS= read -r -d '' srcfile <&3; do
    convert_book "$srcfile"
  done 3< <(find html-src -name '*.htm' -print0)
}

main "$@"

5
update-mirror.sh Normal file
View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
#
# Refresh the local Project Gutenberg mirror:
#   1. fetch pg_catalog.csv into the current directory;
#   2. sync the HTML editions of all books (with their images) into html-src/.
# The exit status is non-zero if either rsync run failed.

readonly RSYNC_HOST="aleph.gutenberg.org"
status=0

# Catalog of all books.
rsync -avz "${RSYNC_HOST}::gutenberg-epub/feeds/pg_catalog.csv" . || status=$?

# Filter rules for the book sync. rsync acts on the first rule that matches,
# so the order matters: unwanted trees are excluded first, then directories
# and the wanted file types are included, and everything else is excluded.
mirror_filters=(
  --exclude "/images/"
  --exclude "/pg/"
  --exclude "/retired/"
  --exclude "/cache/"
  --exclude "*/*-page-images/*"
  --exclude "*/*-page-images/"
  --exclude "*/old/*"
  --exclude "*/old/"
  --include "*/"
  --include "*.htm"
  --include "*.jpg"
  --include "*.png"
  --include "*.gif"
  --exclude "*"
)
rsync -avz --delete --delete-excluded "${mirror_filters[@]}" \
  "${RSYNC_HOST}::gutenberg" html-src || status=$?

# The top-level hosting banner matches the *.png include but is not part of
# any book. -f keeps this quiet when the file is absent.
rm -f -- html-src/hosted-by-ibiblio.png

# Plain-text variant of the sync, kept for reference:
#rsync -avz --delete --delete-excluded --exclude "/images/" --exclude "/pg/" --exclude "/retired/" --exclude "/cache/" --exclude "*/*-h/*" --exclude "*/*-h/" --exclude "*/old/*" --exclude "*/old/" --include "*/" --include "*-8.txt" --exclude "*" aleph.gutenberg.org::gutenberg txt-src

exit "$status"