
Commit 4680b33

mokuhasushi (Antonio Tirone) and Antonio Tirone authored

feat: migrate mnq sqs tutorial source code to serverless examples (#70)

* feat: migrate mnq sqs tutorial source code to serverless examples
* feat: zip archives via terraform, tf outputs

Co-authored-by: Antonio Tirone <[email protected]>

1 parent 4c0ef15 commit 4680b33

File tree

9 files changed: +408 −1 lines changed

README.md

+2 −1

```diff
@@ -82,7 +82,8 @@ Table of Contents:
 | **[Kong API Gateway](projects/kong-api-gateway/README.md)** <br/> Deploying a Kong Gateway on containers to provide routing to functions. | CaaS & FaaS | Python | [Serverless Framework] |
 | **[Serverless Gateway](https://github.com/scaleway/serverless-gateway)** <br/> Our serverless gateway for functions and containers. | API Gateway | Python | [Python API Framework] |
 | **[Monitoring Glaciers](projects/blogpost-glacier/README.md)** <br/> A project to monitor glaciers and the impact of global warming. | S3 & RDB | Golang | [Serverless Framework] |
-| **[Manage large message](projects/large-messages/README.md)** <br/> An example of infrastructure to manage large messages. | PaaS & S3 | Python | [Terraform] |
+| **[Manage large message](projects/large-messages/README.md)** <br/> An example of infrastructure to manage large messages. | PaaS & S3 | Python | [Terraform] |
+| **[Serverless scraping](projects/serverless-scraping/README.md)** <br/> An example of infrastructure to scrape the hackernews website. | PaaS & RDB | Python | [Terraform] |
 
 [Serverless Framework]: https://github.com/scaleway/serverless-scaleway-functions
 [Terraform]: https://registry.terraform.io/providers/scaleway/scaleway/latest/docs
```
+16

@@ -0,0 +1,16 @@

```
venv/
.env
*.zip
package/

# terraform
**/.terraform/*

*.tfstate
*.tfstate.*

crash.log
crash.*.log

*.tfvars
*.tfvars.json
```
+37

@@ -0,0 +1,37 @@

# Create a serverless scraping architecture

This is the source code for the tutorial: [Create a serverless scraping architecture, with Scaleway Messaging and Queuing SQS, Serverless Functions and Managed Database](https://www.scaleway.com/en/docs/tutorials/create-serverless-scraping).

In this tutorial we show how to set up a simple application that reads [Hacker News](https://news.ycombinator.com/news) and processes the articles it finds there asynchronously, using Scaleway serverless products.

## Requirements

This example assumes you are familiar with how serverless functions work. If needed, you can check the [official Scaleway documentation](https://www.scaleway.com/en/docs/serverless/functions/quickstart/).

This example is written using Python and Terraform, and assumes you have [set up authentication for the Terraform provider](https://registry.terraform.io/providers/scaleway/scaleway/latest/docs#authentication).

## Context

**The architecture deployed in this tutorial consists of two functions, two triggers, an SQS queue, and an RDB instance.**

*The producer function, activated by a recurrent cron trigger, scrapes Hacker News for articles published in the last 15 minutes and pushes the title and URL of each article to an SQS queue created with Scaleway Messaging and Queuing.*

*The consumer function, triggered by each new message on the SQS queue, consumes messages published to the queue, scrapes some data from the linked article, and then writes the data into a Scaleway Managed Database.*
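The only contract between the two functions is the shape of the queue message. The sketch below is illustrative: the `url` and `title` field names come from the source files in this commit, the sample values are made up, and the fact that the SQS trigger hands the raw message body to the consumer in the event's `body` field is what the consumer code relies on.

```python
import json

# Payload the producer serializes for each fresh article (sample values are hypothetical).
message_body = json.dumps({
    "url": "https://example.com/some-article",  # hypothetical article link
    "title": "A sample Hacker News title",      # hypothetical title
})

# The SQS trigger delivers the raw message body in the function event's "body" field,
# which the consumer decodes before scraping the linked page.
event = {"body": message_body}
parsed = json.loads(event["body"])
print(parsed["url"], parsed["title"])
```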
## Setup

Once you have cloned this repository, you just need to deploy it using Terraform:

```bash
terraform init
terraform apply
```

## Running

Everything is already up and running! You can check correct execution using the Scaleway Cockpit, and by connecting to your RDB instance to see the results:

```bash
psql -h $(terraform output -raw db_ip) --port $(terraform output -raw db_port) -d hn-database -U worker
```
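If you prefer to check the results from Python rather than `psql`, here is a minimal sketch, assuming the same connection details are available in your environment (for instance exported from the `terraform output` values, a hypothetical setup step) and that the consumer has already created the `articles` table. The environment variable names mirror the ones the consumer function reads.

```python
import os

import pg8000.native

# Connection details: assumed to be exported in your environment, e.g. from terraform outputs.
conn = pg8000.native.Connection(
    host=os.environ["DB_HOST"],
    port=int(os.environ["DB_PORT"]),
    database=os.environ["DB_NAME"],
    user=os.environ["DB_USER"],
    password=os.environ["DB_PASSWORD"],
)

# List the most recently inserted articles and their tag counts.
for row in conn.run(
    "SELECT title, url, a_count, h1_count, p_count FROM articles ORDER BY id DESC LIMIT 10"
):
    print(row)

conn.close()
```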
@@ -0,0 +1,2 @@

```
# Ignore everything in this directory except this file
!.gitignore
```
@@ -0,0 +1,73 @@

```python
import json
import os

import pg8000.native
import requests
from bs4 import BeautifulSoup

db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')

CREATE_TABLE_IF_NOT_EXISTS = """
CREATE TABLE IF NOT EXISTS articles (
  id SERIAL PRIMARY KEY,
  title VARCHAR(255) NOT NULL,
  url VARCHAR(255) NOT NULL,
  a_count INTEGER NOT NULL,
  h1_count INTEGER NOT NULL,
  p_count INTEGER NOT NULL
);"""

INSERT_INTO_ARTICLES = """
INSERT INTO articles (title, url, a_count, h1_count, p_count)
VALUES (:title, :url, :a_count, :h1_count, :p_count) RETURNING id
;"""


def scrape_page_for_stats(url):
    """
    Scrape the page at the given URL and return counts of the chosen tags.
    """
    # Articles hosted on HN itself have a relative URL (e.g. "item?id=...")
    if url[:4] == "item":
        url = "https://news.ycombinator.com/" + url

    page = requests.get(url, timeout=15)
    html_doc = page.content
    soup = BeautifulSoup(html_doc, 'html.parser')

    tags = ['a', 'h1', 'p']

    return {tag: len(soup.find_all(tag)) for tag in tags}


def scrape_and_save_to_db(event):
    """
    Scrape the page referenced by the queue message and save its tag counts in the database.
    """
    body = json.loads(event["body"])

    tags_count = scrape_page_for_stats(body['url'])
    conn = None
    try:
        conn = pg8000.native.Connection(
            host=db_host, database=db_name, port=db_port,
            user=db_user, password=db_password, timeout=15,
        )

        # Where else could we create the table, to avoid manual intervention?
        conn.run(CREATE_TABLE_IF_NOT_EXISTS)
        conn.run(
            INSERT_INTO_ARTICLES,
            title=body['title'], url=body['url'],
            a_count=tags_count['a'], h1_count=tags_count['h1'], p_count=tags_count['p'],
        )

    finally:
        if conn is not None:
            conn.close()
    return 200


def handle(event, context):
    try:
        status = scrape_and_save_to_db(event)
        return {'statusCode': status, 'headers': {'Content-Type': 'text/plain'}}
    except Exception as e:
        print("error", e)
        return {'statusCode': 500, 'body': str(e)}


if __name__ == '__main__':
    # Local smoke test with a hardcoded article
    handle({'body': json.dumps({'url': 'https://google.com', 'title': 'test url'})}, None)
```
@@ -0,0 +1,3 @@

```
pg8000
requests
bs4
```
@@ -0,0 +1,51 @@

```python
import json
import os
from datetime import datetime, timedelta

import boto3
import requests
from bs4 import BeautifulSoup

HN_URL = "https://news.ycombinator.com/newest"
SCW_SQS_URL = "https://sqs.mnq.fr-par.scaleway.com"

queue_url = os.getenv('QUEUE_URL')
sqs_access_key = os.getenv('SQS_ACCESS_KEY')
sqs_secret_access_key = os.getenv('SQS_SECRET_ACCESS_KEY')


def scrape_and_push():
    """
    Scrape the HN website for articles published in the last 15 minutes,
    and push their title and URL to the SQS queue.
    """
    page = requests.get(HN_URL, timeout=15)
    html_doc = page.content

    soup = BeautifulSoup(html_doc, 'html.parser')

    # On the HN "newest" page there are exactly 30 articles; each has a `titleline` and an `age` span
    titlelines = soup.find_all(class_="titleline")
    ages = soup.find_all(class_="age")

    sqs = boto3.client(
        'sqs',
        endpoint_url=SCW_SQS_URL,
        aws_access_key_id=sqs_access_key,
        aws_secret_access_key=sqs_secret_access_key,
        region_name='fr-par',
    )

    for age, titleline in zip(ages, titlelines):
        time_str = age["title"]
        time = datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%S")
        # Skip articles older than 15 minutes
        if datetime.utcnow() - time > timedelta(minutes=15):
            continue

        body = json.dumps({'url': titleline.a["href"], 'title': titleline.a.get_text()})
        sqs.send_message(QueueUrl=queue_url, MessageBody=body)

    return page.status_code


def handle(event, context):
    try:
        status = scrape_and_push()
        return {'statusCode': status, 'headers': {'Content-Type': 'text/plain'}}
    except Exception as e:
        print(e)
        return {'statusCode': 500, 'body': str(e)}


if __name__ == "__main__":
    handle(None, None)
```
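To verify that messages are actually landing on the queue (for instance before the consumer trigger is wired up), a minimal sketch like the one below can help. It reuses the producer's endpoint and environment variables and relies only on the standard SQS `receive_message` call; treat it as a debugging aid under those assumptions, not as part of the deployed architecture.

```python
import os

import boto3

SCW_SQS_URL = "https://sqs.mnq.fr-par.scaleway.com"

# Same credentials and queue URL the producer reads from its environment.
sqs = boto3.client(
    "sqs",
    endpoint_url=SCW_SQS_URL,
    aws_access_key_id=os.environ["SQS_ACCESS_KEY"],
    aws_secret_access_key=os.environ["SQS_SECRET_ACCESS_KEY"],
    region_name="fr-par",
)

# Peek at up to 10 pending messages (they stay hidden only for the visibility timeout).
response = sqs.receive_message(
    QueueUrl=os.environ["QUEUE_URL"],
    MaxNumberOfMessages=10,
    WaitTimeSeconds=5,
)

for message in response.get("Messages", []):
    print(message["Body"])
```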
@@ -0,0 +1,3 @@

```
boto3
bs4
requests
```
