From 1a62190fed26c6ed637904d60eadf824ade7112e Mon Sep 17 00:00:00 2001 From: David Ebbo Date: Thu, 20 Jun 2024 15:15:53 +0200 Subject: [PATCH] Misc wiki image fixes: - New flag to allow skipping images - Allow no artist if license is valid - Fix support for RGBA png files --- .../Utilities/get_wiki_images.py | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/OZprivate/ServerScripts/Utilities/get_wiki_images.py b/OZprivate/ServerScripts/Utilities/get_wiki_images.py index 62ad044c..f50d24c7 100644 --- a/OZprivate/ServerScripts/Utilities/get_wiki_images.py +++ b/OZprivate/ServerScripts/Utilities/get_wiki_images.py @@ -207,9 +207,12 @@ def get_image_license_info(escaped_image_name): image_metadata = r.json() extmetadata = image_metadata["query"]["pages"]["-1"]["imageinfo"][0]["extmetadata"] - license_info["artist"] = extmetadata["Artist"]["value"] - # Strip the html tags from the artist - license_info["artist"] = re.sub(r'<[^>]*>', '', license_info["artist"]) + if "artist" in extmetadata: + license_info["artist"] = extmetadata["Artist"]["value"] + # Strip the html tags from the artist + license_info["artist"] = re.sub(r'<[^>]*>', '', license_info["artist"]) + else: + license_info["artist"] = "Unknown artist" license_info["license"] = extmetadata["License"]["value"] @@ -293,6 +296,9 @@ def save_wiki_image_for_qid(ott, qid, image, src, rating, output_dir, check_if_u # Crop and resize the image using PIL im = Image.open(uncropped_image_path) + # Convert to RGB to avoid issues with transparency when working with a png file + if im.mode in ("RGBA", "P"): + im = im.convert("RGB") im = im.resize( (300, 300), box = (crop_box.x, crop_box.y, crop_box.x + crop_box.width, crop_box.y + crop_box.height) @@ -354,7 +360,7 @@ def save_all_wiki_vernaculars_for_qid(ott, qid, vernaculars_by_language): db_connection.commit() -def process_leaf(ott_or_taxon, image_name=None, rating=None): +def process_leaf(ott_or_taxon, image_name, rating, skip_images): # If ott_or_taxon is a number, it's an ott. Otherwise, it's a taxon name. sql = "SELECT ott,wikidata,name FROM ordered_leaves WHERE " if ott_or_taxon.isnumeric(): @@ -378,21 +384,23 @@ def process_leaf(ott_or_taxon, image_name=None, rating=None): if not rating: rating = 40000 if image_name else 35000 - # If a specific image name is passed in, use it. Otherwise, we need to look it up. - # Also, if an image is passed in, we categorize it as a bespoke image, not wiki. json_item = get_wikidata_json_for_qid(qid) - if image_name: - image = { "name": image_name } - src = src_flags['onezoom_bespoke'] - else: - image = get_preferred_or_first_image_from_json_item(json_item) - src = src_flags['wiki'] - save_wiki_image_for_qid(ott, qid, image, src, rating, output_dir) + + if not skip_images: + # If a specific image name is passed in, use it. Otherwise, we need to look it up. + # Also, if an image is passed in, we categorize it as a bespoke image, not wiki. + if image_name: + image = { "name": image_name } + src = src_flags['onezoom_bespoke'] + else: + image = get_preferred_or_first_image_from_json_item(json_item) + src = src_flags['wiki'] + save_wiki_image_for_qid(ott, qid, image, src, rating, output_dir) vernaculars_by_language = get_vernaculars_by_language_from_json_item(json_item) save_all_wiki_vernaculars_for_qid(ott, qid, vernaculars_by_language) -def process_clade(ott_or_taxon, dump_file): +def process_clade(ott_or_taxon, dump_file, skip_images): # Get the left and right leaf ids for the passed in taxon sql = "SELECT ott,name,leaf_lft,leaf_rgt FROM ordered_nodes WHERE " @@ -408,15 +416,16 @@ def process_clade(ott_or_taxon, dump_file): logger.error(f"'{ott_or_taxon}' not found in ordered_nodes table") return - # Find all the leaves in the clade that don't have wiki images (ignoring images from other sources) - sql = """ - SELECT wikidata, ordered_leaves.ott FROM ordered_leaves - LEFT OUTER JOIN (SELECT ott,src,url FROM images_by_ott WHERE src={}) as wiki_images_by_ott ON ordered_leaves.ott=wiki_images_by_ott.ott - WHERE url IS NULL AND ordered_leaves.id >= {} AND ordered_leaves.id <= {}; - """.format(subs, subs, subs) - db_curs.execute(sql, (src_flags['wiki'], leaf_left, leaf_right)) - leaves_without_images = dict(db_curs.fetchall()) - logger.info(f"Found {len(leaves_without_images)} taxa without an image in the database") + if not skip_images: + # Find all the leaves in the clade that don't have wiki images (ignoring images from other sources) + sql = """ + SELECT wikidata, ordered_leaves.ott FROM ordered_leaves + LEFT OUTER JOIN (SELECT ott,src,url FROM images_by_ott WHERE src={}) as wiki_images_by_ott ON ordered_leaves.ott=wiki_images_by_ott.ott + WHERE url IS NULL AND ordered_leaves.id >= {} AND ordered_leaves.id <= {}; + """.format(subs, subs, subs) + db_curs.execute(sql, (src_flags['wiki'], leaf_left, leaf_right)) + leaves_without_images = dict(db_curs.fetchall()) + logger.info(f"Found {len(leaves_without_images)} taxa without an image in the database") # Find all the leaves in the clade that don't have wiki vernaculars (ignoring vernaculars from other sources) sql = """ @@ -429,7 +438,7 @@ def process_clade(ott_or_taxon, dump_file): logger.info(f"Found {len(leaves_without_vernaculars)} taxa without a vernacular in the database") for qid, image, vernaculars in enumerate_dump_items_with_images_or_vernaculars(dump_file): - if image and qid in leaves_without_images: + if not skip_images and image and qid in leaves_without_images: ott = leaves_without_images[qid] save_wiki_image_for_qid(ott, qid, image, src_flags['wiki'], 35000, output_dir, check_if_up_to_date=False) if vernaculars and qid in leaves_without_vernaculars: @@ -448,8 +457,9 @@ def main(): subparsers = parser.add_subparsers(help='help for subcommand', dest="subcommand") def add_common_args(parser): - parser.add_argument('--config_file', default=None, help='The configuration file to use. If not given, defaults to private/appconfig.ini') - parser.add_argument('--output_dir', '-o', default=None, help="The location to save the cropped pictures (e.g. 'FinalOutputs/img'). If not given, defaults to ../../../static/FinalOutputs/img (relative to the script location). Files will be saved under output_dir/{src_flag}/{3-digits}/fn.jpg") + parser.add_argument('--config-file', default=None, help='The configuration file to use. If not given, defaults to private/appconfig.ini') + parser.add_argument('--output-dir', '-o', default=None, help="The location to save the cropped pictures (e.g. 'FinalOutputs/img'). If not given, defaults to ../../../static/FinalOutputs/img (relative to the script location). Files will be saved under output_dir/{src_flag}/{3-digits}/fn.jpg") + parser.add_argument('--skip-images', action='store_true', help='Only process vernaculars, not images') parser_leaf = subparsers.add_parser('leaf', help='Process a single ott') parser_leaf.add_argument('ott_or_taxon', type=str, help='The leaf ott or taxon to process') @@ -483,10 +493,10 @@ def add_common_args(parser): if args.subcommand == "leaf": # Process one leaf, optionally forcing the specified image - process_leaf(args.ott_or_taxon, args.image, args.rating) + process_leaf(args.ott_or_taxon, args.image, args.rating, args.skip_images) elif args.subcommand == "clade": # Process all the images in the passed in clade - process_clade(args.ott_or_taxon, args.dump_file) + process_clade(args.ott_or_taxon, args.dump_file, args.skip_images) else: parser.print_help()