changed to more object-y style

Luke Ogburn
2022-04-05 21:02:35 -04:00
parent bf8d251c21
commit 64888cea1e
6 changed files with 340 additions and 266 deletions

5  .gitignore (vendored)

@@ -1,4 +1,3 @@
save/
*/
temp/
config.json
savefile.json


@@ -1,269 +1,45 @@
from mastodon import Mastodon
from bot import bot
import praw
from scraper import scraper
import requests
import os
import time
import json
import logging
'''
TODO:
for deployment:
done:
- [x] Keep track of what has been scraped and tooted to not duplicate posts
- [x] Download and posting of video files
- [x] Make sure text-only reddit posts work
- [x] Eternal looping, run script every 5 mins or something
- [x] Different masto post structures for different post types (videos need links)
extras:
- [x] Import bot/scraper settings from file for automation
- [x] Random post if low activity
likely:
- […] Keep track of what has been scraped and tooted to not duplicate posts
- […] Separate methods to make code cleaner
- […] Debugging logging
- [ ] Move all vars into config
- [ ] Docker image
unlikely:
- [ ] Updating from @mention toot
- [ ] Improve debugging logging
- [ ] Info logging
- [ ] Error logging
- [ ] Add twitter bot
- [ ] Docker image?
- [ ] Make this an installable (pip?) package
'''
def run(masto, service):
# post any new posts, up to limit
print("Scraping")
subs = service.scrape_all()
print("Tooting if necessary")
# Mastodon bot to post things
class bot():
def __init__(self, config, debug=False):
self.debug = debug
self.masto = Mastodon(access_token=config["mastodon"]["access_token"], api_base_url=config["mastodon"]["host"])
# uploads media to mastodon, returns the mastodon ID
# specify mimetype of video files as "video/mp4" to avoid error
def upload_media(self, filename, mimetype=None):
if self.debug: logging.info(f"Uploading media {filename}")
return self.masto.media_post(filename, mime_type=mimetype)
# uploads all given media
def upload_all_media(self, filenames):
ids = []
for fn in filenames:
ids.append(self.upload_media(fn))
return ids
def toot(self, text, media=None):
if self.debug: logging.info(f"Posting:\n Text: {text}\n Media: {', '.join(media) if media != None else 'None'}")
self.masto.status_post(text, media_ids=media)
# Reddit (maybe more in future) scraper to get posts
# parameters:
# service: one of ["reddit"]
# config: dict of config variables
class scraper():
def __init__(self, service, config, debug=False):
# dev
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logging.getLogger().addHandler(console)
self.current_services = ["reddit"]
# error checking
if service.lower() not in self.current_services:
logging.error("Service invalid")
return None
# login to service
if service == "reddit":
self.login = praw.Reddit(
client_id=config["reddit"]["client_id"],
client_secret=config["reddit"]["client_secret"],
password=config["reddit"]["password"],
user_agent=config["reddit"]["user_agent"],
username=config["reddit"]["username"])
# make sure necessary filestructure is in place
needed_directories = ["temp", "save", f"save/{service}"]
for d in needed_directories:
if not os.path.isdir(d): os.mkdir(d)
if not os.path.exists(f"save/{service}"):
open(f"save/{service}", "w+")
f.close()
# set object variables
self.service = service
self.debug = debug
self.places = config[service]["places"]
# seent it list is a little more complicated
self.seent = {}
for f in os.listdir(f"save/{service}"):
savefile = open(f"save/{service}/{f}", "r").read().split("\n")
self.seent[f.split("/")[-1]] = [item for item in savefile] # dict faster
### HELPER METHODS
# helper method to clean out folder (delete all contents)
# expected structure: [["temp/a/1", "temp/a/2"], [], [], ["temp/e/1"]]
def remove_folders(self, folders_list):
for folder in folders_list:
if self.debug: logging.info(f"Clearing folder {folder}")
for file in folder:
os.remove(file)
if len(folder) > 0:
subfolder = "/".join(folder[0].split("/")[:-1])
os.rmdir(subfolder)
# helper method to download media
def download_media(self, url, filename):
# get file first
if self.debug: logging.info(f"Downloading {url} info {filename}")
resp = requests.get(url)
if resp.ok:
# make sure directory structure exists
structure = filename.split("/")
for i in range(len(structure[:-1])):
d = "/".join(structure[0:i+1])
if not os.path.isdir(d): os.mkdir(d)
# write the downloaded content to file
with open(filename, "wb+") as f:
f.write(resp.content)
# reddit helper method to return the post type
def get_post_type(self, post):
if post.url[8] == 'i': return "image"
if post.url[8] == 'v': return "video"
if post.url[23:30] == "gallery": return "gallery"
return "unknown"
# helper to save a list with a limit to a savefile
def create_savefile(self, places, limit):
# write to seent list memory and return posts
for place in places:
if self.debug: logging.info(f"Creating savefile save/{self.service}/{place}")
new_seent = [k for k in self.seent[place] if k != ""]
if len(new_seent) > limit: new_seent = new_seent[:limit]
open(f"save/{self.service}/{place}", "w").write("\n".join(new_seent))
### REDDIT METHODS
# gets posts from a given subreddit
def reddit_scrape(self, sub_name, limit):
# make sure seent list can store files for this sub
if sub_name not in self.seent:
self.seent[sub_name] = []
if not os.path.exists(f"save/{self.service}/{sub_name}"):
f = open(f"save/{self.service}/{sub_name}", "w+")
f.close()
# get posts that aren't in seent list
post_list = []
for p in self.login.subreddit(sub_name).new(limit=limit):
if p.id not in self.seent[sub_name]:
if self.debug: logging.info(f"Scraping post {p.id}")
post_list.append(p)
self.seent[sub_name] = [p.id] + self.seent[sub_name]
return post_list
# gets posts from all subreddits
def reddit_scrape_all(self, sub_names, limit):
subposts = {}
for sub in sub_names:
subposts[sub] = self.reddit_scrape(sub, limit)
return subposts
# downloads a given post; media is stored in temp/post_id/n
# returns a list of the stored file locations for that post
def reddit_download(self, post):
def make_gallery_urls():
nonlocal post
urls = []
for m in post.media_metadata:
mimetype = post.media_metadata[m]["m"]
end = mimetype[mimetype.find("/")+1:]
urls.append(f"https://i.redd.it/{m}.{end}")
return urls
# get the media URLs in array
reddit_urls = []
post_type = self.get_post_type(post)
if post_type == "image":
reddit_urls = [post.url]
elif post_type == "video":
raw_url = post.media["reddit_video"]["fallback_url"]
reddit_urls = [raw_url[:raw_url.find("?")]]
elif post_type == "gallery":
reddit_urls = make_gallery_urls()
# download all media
local_urls = []
i = 0
for url in reddit_urls:
i += 1
name = f"temp/{post.id}/{i}"
if self.debug: logging.info(f"Downloading {url} ({i}/{len(reddit_urls)})")
self.download_media(url, name)
local_urls.append(name)
return local_urls
# uses reddit_download to get all posts' media in a list of posts
# takes a list of posts, not a list of subs
# returns a list of lists, one list per post containing the local download locations for that post
def reddit_download_all(self, posts):
image_locations = []
for post in posts:
image_locations.append(self.download(post))
return image_locations
### WRAPPER METHODS; these should be the ones called directly
# gets posts from a given service's place (ie, a subreddit or twitter feed)
def scrape(self, place, limit=10):
if self.debug: logging.info(f"Scraping {self.service}: {place}... ")
if self.service == "reddit":
result = self.reddit_scrape(place, limit)
if self.debug: logging.info(f"Done scraping {self.service}: {place}.")
return result
# gets posts from a given service's places (ie, multiple subreddits or feeds)
def scrape_all(self, places=None, limit=10):
if places == None: places = self.places
if self.service == "reddit":
result = self.reddit_scrape_all(places, limit)
return result
# downloads a given post's media and return the locations
def download(self, post):
if self.service == "reddit":
if self.debug: logging.info(f"Downloading {post.id}... ")
result = self.reddit_download(post)
if self.debug: logging.info(f"Done downloading {post.id}.")
return result
# downloads a list of posts' media and returns a list of the locations
def download_all(self, posts):
if self.service == "reddit":
post_ids = [p.id for p in posts]
result = self.reddit_download_all(posts)
return result
# creates the savefile for a list of posts.
def remember(self, places=None, limit=10):
if places == None: places = self.places
if self.debug: logging.info(f"Remembering {', '.join(places)}...")
self.create_savefile(places, limit)
if self.debug: logging.info(f"Remembered {', '.join(places)}.")
### TOOTER METHODS (reddit only for now)
# builds a toot for convenience
def build_toot(self, masto, post):
toot = {}
toot["text"] = post.title
if self.get_post_type(post) == "video": toot["text"] += f"\n\n{post.url}"
local_media = self.download(post)
toot["media"] = masto.upload_all_media(local_media)
return toot
# toots all posts in list
def toot_posts(self, masto, posts):
for post in posts:
to_toot = self.build_toot(masto, post)
masto.toot(to_toot["text"], to_toot["media"])
return True
### RUNNING METHODS
def run(self, masto, places=None, limit=10):
if self.debug: logging.info(f"Running {self.service}.")
if places == None: places = self.places
subs = self.scrape_all(places=places, limit=limit)
for sub in subs:
self.toot_posts(masto, subs[sub])
self.remember()
print(f"  Tooting {sub}")
service.toot_posts(masto, subs[sub])
print("Remembering")
service.remember()
# post random if it has been a while
print("Keeping lively")
service.keep_lively()
def main():
while True:
@@ -271,12 +47,11 @@ def main():
config = json.load(open('config.json', 'r'))
# make bots
masto = bot(config)
reddit = scraper("reddit", config, debug=True)
reddit = scraper("reddit", config, low_activity_random=True)
# run bots
reddit.run(masto)
run(masto, reddit)
# buffer time bc posts only happen so often
# buffer time bc posts only happen so often so why check
time.sleep(60)
time.sleep(5)
if __name__ == "__main__":
main()
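
For reference, a minimal sketch of the config.json shape this code reads; every key below is used in bot.py or reddit.py, but the values and subreddit names are placeholders, not part of the commit.
# hypothetical example of the parsed config.json contents
example_config = {
    "mastodon": {
        "access_token": "<mastodon access token>",
        "host": "https://example.social"
    },
    "reddit": {
        "client_id": "<reddit client id>",
        "client_secret": "<reddit client secret>",
        "username": "<reddit username>",
        "password": "<reddit password>",
        "user_agent": "<descriptive user agent>",
        "places": ["aww", "EarthPorn"]  # subreddit names are made up
    }
}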

25  bot.py (new file)

@@ -0,0 +1,25 @@
from mastodon import Mastodon
import logging
# Mastodon bot to post things
class bot():
def __init__(self, config, debug=False):
self.debug = debug
self.masto = Mastodon(access_token=config["mastodon"]["access_token"], api_base_url=config["mastodon"]["host"])
# uploads media to mastodon, returns the mastodon ID
# specify mimetype of video files as "video/mp4" to avoid error
def upload_media(self, filename, mimetype=None):
logging.info(f"Uploading media {filename}")
return self.masto.media_post(filename, mime_type=mimetype)
# uploads all given media
def upload_all_media(self, filenames):
ids = []
for fn in filenames:
ids.append(self.upload_media(fn))
return ids
def toot(self, text, media=None):
logging.info(f"Posting:\n Text: {text}")
print("self.masto.status_post(text, media_ids=media)")

71  helper.py (new file)

@@ -0,0 +1,71 @@
import requests
import os
import logging
from datetime import datetime
### HELPER METHODS
# helper method to clean out folder (delete all contents)
# expected structure: [["temp/a/1", "temp/a/2"], [], [], ["temp/e/1"]]
class helper():
def __init__(self, service):
# copy the service's variables to make them local
# because it's easier to access and doesn't require the
# service to pass itself in every time
self.service = service.service
self.low_activity_random = service.low_activity_random
self.debug = service.debug
self.places = service.places
self.seent = service.seent
def remove_folders(folders_list):
for folder in folders_list:
logging.info(f"Clearing folder {folder}")
for file in folder:
os.remove(file)
if len(folder) > 0:
subfolder = "/".join(folder[0].split("/")[:-1])
os.rmdir(subfolder)
# helper method to download media
def download_media(url, filename):
# get file first
logging.info(f"Downloading {url} info {filename}")
resp = requests.get(url)
if resp.ok:
# make sure directory structure exists
structure = filename.split("/")
for i in range(len(structure[:-1])):
d = "/".join(structure[0:i+1])
if not os.path.isdir(d): os.mkdir(d)
# write the downloaded content to file
with open(filename, "wb+") as f:
f.write(resp.content)
# reddit helper method to return the post type
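# for reference, the magic indices below key off reddit URL shapes, e.g.:
#   https://i.redd.it/<id>.jpg            -> url[8] == 'i'   (image)
#   https://v.redd.it/<id>                -> url[8] == 'v'   (video)
#   https://www.reddit.com/gallery/<id>   -> url[23:30] == "gallery"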
def get_post_type(post):
print(post.url)
if post.url[8] == 'i': return "image"
if post.url[8] == 'v': return "video"
if post.url[23:30] == "gallery": return "gallery"
return "unknown"
# returns True if ts1 is more than ts2 seconds in the past
# ts1 should be a timestamp value, ts2 a number of seconds
def ts_older(ts1, ts2):
# timedelta spanning ts2 seconds
hours_delta = datetime.fromtimestamp(ts2) - datetime.fromtimestamp(0)
# time elapsed since ts1
stamp_delta = datetime.fromtimestamp(ts1)
stamp_delta = datetime.now() - stamp_delta
return stamp_delta > hours_delta
# returns True if the place hasn't had a post in the past `hours` hours
# (12 by default) according to the savefile
def been_awhile(seent_time, hours=12):
long_time = 60 * 60 * hours
return helper.ts_older(int(seent_time), long_time)
# takes in a ListingGenerator (lazy listing of reddit posts)
# and converts it to a plain list so it can be indexed and reversed
def reddit_listify(LG):
return [p for p in LG]
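
A small worked example of the two time helpers (values are illustrative): ts_older(ts1, ts2) is true once ts1 is more than ts2 seconds in the past, and been_awhile() wraps it with a 12-hour default window.
import time
from helper import helper

three_hours_ago = time.time() - 3 * 60 * 60
print(helper.ts_older(three_hours_ago, 60 * 60))        # True: more than an hour old
print(helper.been_awhile(three_hours_ago))              # False: quiet for under 12 hours
print(helper.been_awhile(time.time() - 13 * 60 * 60))   # True: quiet for over 12 hours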

129  reddit.py (new file)

@@ -0,0 +1,129 @@
from helper import helper
import praw
import json
import time
import logging
class reddit_scraper:
def __init__(self, config):
self.login = praw.Reddit(
client_id=config["reddit"]["client_id"],
client_secret=config["reddit"]["client_secret"],
password=config["reddit"]["password"],
user_agent=config["reddit"]["user_agent"],
username=config["reddit"]["username"])
self.places = config["reddit"]["places"]
savefile = open("savefile.json", "r")
savefile = json.load(savefile)
try: self.seent = savefile["reddit"]
except: self.seent = {}
### REDDIT METHODS
# gets posts from a given subreddit
def scrape(self, sub, limit):
# make sure self.seent has the sub, add if not
if sub not in self.seent: self.seent[sub] = time.time()
# get posts that aren't in seent list
post_list = []
posts = self.login.subreddit(sub).new(limit=limit)
posts = helper.reddit_listify(posts)
for p in posts[::-1]:
if helper.ts_older(p.created, self.seent[sub]):
break
logging.info(f"Scraping post {p.id}")
post_list.append(p)
self.seent[sub] = p.created
return post_list
# scrapes all subreddits
def scrape_all(self, limit):
subposts = {}
for place in self.places:
subposts[place] = self.scrape(place, limit)
return subposts
# downloads a given post; media is stored in temp/post_id/n
# returns a list of the stored file locations for that post
def download(self, post):
def make_gallery_urls():
nonlocal post
urls = []
for m in post.media_metadata:
mimetype = post.media_metadata[m]["m"]
end = mimetype[mimetype.find("/")+1:]
urls.append(f"https://i.redd.it/{m}.{end}")
return urls
# video is sketchy, sorta WIP but maybe impossible
# to have consistently. this function does its best
def try_video_urls(post):
try:
raw_url = post.media["video"]["fallback_url"]
return [raw_url[:raw_url.find("?")]]
except:
try:
raw_url = post.media["reddit_video"]["fallback_url"]
return [raw_url[:raw_url.find("?")]]
except:
return []
return [] # should never be reached but just in case
# get the media URLs in array
urls = []
post_type = helper.get_post_type(post)
if post_type == "image":
urls = [post.url]
elif post_type == "video":
urls = try_video_urls(post)
elif post_type == "gallery":
urls = make_gallery_urls()
urls = [] # neuter download (gallery posts are skipped for now)
# download all media
local_urls = []
i = 0
for url in urls:
i += 1
name = f"temp/{post.id}/{i}"
logging.info(f"Downloading {url} ({i}/{len(urls)})")
helper.download_media(url, name)
local_urls.append(name)
return local_urls
# posts if it's been a while: checks each sub and pulls a random post from any that have gone quiet
def keep_lively(self):
for sub in self.places:
if helper.been_awhile(self.seent[sub]):
self.random_post(sub)
# gets a random post from reddit
def random_post(self, place):
return self.login.subreddit(place).random()
# writes the seent timestamps out to the savefile
def remember(self):
print(f"{self.seent}")
savefile = json.load(open("savefile.json", "r"))
savefile["reddit"] = self.seent
savefile = json.dumps(savefile)
with open("savefile.json", "w") as f:
f.write(savefile)
### TOOTER METHODS
# takes a post and returns a dict of the toot text and media IDs
def build_toot(self, masto, post):
toot = {}
toot["text"] = post.title
if helper.get_post_type(post) == "video": toot["text"] += f"\n\n{post.url}"
local_media = self.download(post)
toot["media"] = masto.upload_all_media(local_media)
return toot
# toots all posts in list
def toot_posts(self, masto, posts):
for post in posts:
to_toot = self.build_toot(masto, post)
masto.toot(to_toot["text"], to_toot["media"])
return True
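
For reference, a sketch of the savefile.json shape that remember() writes and __init__ loads back; the subreddit names and timestamps are made up, but the structure (service, then place, then the created timestamp of the newest post seen) matches the code above.
# hypothetical example of savefile.json contents
example_savefile = {
    "reddit": {
        "aww": 1649125000.0,        # created timestamp of the last seen post
        "EarthPorn": 1649120000.0
    }
}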

75  scraper.py (new file)

@@ -0,0 +1,75 @@
import os
import logging
import json
from reddit import reddit_scraper as reddit
class scraper:
def __init__(self, service, config, low_activity_random=False):
# error checking
scrapers = ["reddit"]
if service.lower() not in scrapers:
logging.error(f"Scraper {service} invalid. Choose one of {', '.join(scrapers)}")
return None
# make sure necessary filestructure is in place
if not os.path.isdir("temp"): os.mkdir("temp")
if not os.path.exists("savefile.json"):
f = open("savefile.json", "w+")
f.write("{}")
# set object variables
self.low_activity_random = low_activity_random
self.service = service
# login to service
if service == "reddit": self.login = reddit(config)
### WRAPPER METHODS
def scrape(self, place, limit=10):
logging.info(f"Scraping {self.service}: {place}... ")
result = self.login.scrape(place, limit)
logging.info(f"Done scraping {self.service}: {place}.")
return result
# gets posts from a given service's places (ie, multiple subreddits or feeds)
def scrape_all(self, limit=10):
return self.login.scrape_all(limit)
# downloads a given post's media and return the locations
def download(self, post):
logging.info(f"Downloading {post.id}... ")
result = self.login.download(post)
logging.info(f"Done downloading {post.id}.")
return result
# downloads a list of posts' media and returns a list of the locations
def download_all(self, posts):
locations = []
for post in posts:
locations.append(self.login.download(post))
return locations
# writes the seent timestamps out to the savefile
def remember(self):
logging.info(f"Remembering {self.service}...")
self.login.remember()
logging.info(f"Remembered {self.service}.")
# posts for each place if it has been a while
def keep_lively(self):
self.login.keep_lively()
# gets a random post from the given place
def random_post(self, place):
logging.info(f"Getting random post for {place}")
return self.login.random_post(place)
### TOOTER METHODS
# takes a post and returns a dict of the toot text and media IDs
def build_toot(self, masto, post):
return self.login.build_toot(masto, post)
# toots all posts in list
def toot_posts(self, masto, posts):
for post in posts:
to_toot = self.build_toot(masto, post)
masto.toot(to_toot["text"], to_toot["media"])
return True