#!/usr/bin/python
#
# imgsizer -- correct image sizes in WWW pages
# by Eric S. Raymond
#
# Fix up IMG tags in given documents to contain correct sizes.
#
# Works with Python 1.5.2
#
# Copy, use, and redistribute freely, but don't take my name off it and
# clearly mark an altered version.  Fixes and enhancements cheerfully
# accepted.
#
# Changelog:
#
# Originally created by Eric S. Raymond 30 Jul 1996
#
# Modified by Erik Rossen 15 May 1999
#
# Added the --nomagick switch, to use file(1) and rdjpgcom(1)
# to determine the image size instead of identify(1) from the
# ImageMagick suite.
#
# Modified by Michael C. Toren 18 Aug 2000
#
# Fixed bug where the SRC attribute's value needed to be in quotes,
# improved command line parsing (but it could still use some work),
# added -q switch to omit quotes when generating tags, and -l switch
# to generate lowercase tags. -mct
#
# Modified by Michael C. Toren 19 Aug 2000
#
# Improved the command line parsing some more, now looks for additional
# arguments via an IMGSIZER environmental variable, added the -d switch
# to set the DocumentRoot, -v switch to display version information,
# and -h switch to display usage information. -mct
#
# Modified by Michael C. Toren 23 Feb 2001
#
# Fixed two bugs reported by Jeroen Valcke, one
# where the -d switch did not function properly if the img src attribute
# was quoted, and another where the &error sub was incorrectly reporting
# the line number an error occurred due to the input record separator
# being set to ">".
#
# Rewritten in Python by Eric S. Raymond 11 July 2001
#
# Time to get rid of the dependency on httpget.  The -l option is gone, too;
# instead, we deduce the right case by looking at the leading tag.  -q
# is gone; we always emit without quotes.  -m is gone too, instead we
# try commands in least to most expensive order, and notice when a command
# returns not to try it again.
#
# Fixes by ESR, 29 July 2001
#
# Incorporated fixes by Peter S. Galbraith.
#
# Fixes by ESR, 25 April 2003
#
# Merged amended versions of Lennart Poettering's fix for Debian bug 139714.
# and Jeroen N. Witmond's fix for Debian bug 168964.  Added regression-test
# production.
#
# Enhancement by ESR, 14 Nov 2003
#
# Verify and merge Lucien Saviot's patch to produce XHTML from XHTML input.
# Also his change to handle spurious line breaks produced by Dave Raggett's
# tidy(1) utility.
#
# Modified by Andrew Gwozdziewycz , 17 June 2004
#
# Added support for the Python Imaging Library to determine size in case of
# failure from file(1), rdjpgcom(1) and identify(1).

import sys, os, getopt, string, re, urllib, commands

# Arrange for both 1.5 and 2.1 compatibility: newer interpreters ship the
# file-comparison module as `filecmp', older ones as `cmp'.  Either way the
# module ends up bound to the name `cmp'.
try:
    import filecmp
    cmp = filecmp
    del filecmp
except ImportError:
    import cmp

# Release identifier; interpolated into `splash' via its %s placeholder.
version = "2.7, 05 Aug 2004"

# Version banner.
# NOTE(review): the text between "See" and "for updates" (presumably a URL)
# is missing from this copy of the file -- restore it from the upstream
# distribution.
splash = """imgsizer version %s, Eric S. Raymond
See for updates."""

# Command-line help text.
usage = """Usage: imgsizer [OPTIONS] [HTML File]
Options:
   -V, --version
        Display version information and exit.
   -h, --help
        Display usage information.
   -d <directory>, --document-root <directory>
        Directory where absolute image filenames (i.e, ones which contain
        a leading "/") may be found.
   -n, --no-overwrite
        Don't overwrite existing width and height tags if both are present.
"""

# Optimization latches -- if an attempt to invoke a command returns 127
# "not found" these will turn off and that command won't be tried again.
magick = 1              # using ImageMagick by default
rdjpgcom = 1            # using rdjpgcom by default
pythonimage = 1         # use python imaging library

def attrformat(xc, dim):
    """Return one size attribute formatted as ' NAME="value"'.

    xc is the dimension value (anything str() accepts) and dim is the
    attribute name in lower case.  The name is emitted as given when the
    module-level `lower' flag is true and upper-cased otherwise, so that
    the attribute matches the case of the tag it is added to.  The result
    starts with a single space so it can be appended to an attribute list.
    """
    if lower:
        name = dim
    else:
        name = string.upper(dim)
    return " " + name + '="' + str(xc) + '"'

def sizefix(infp, outfp):
    """Copy infp to outfp, passing the attributes of each IMG tag
    through transform().

    Both arguments are file-like objects; infp is consumed one character
    at a time with read(1) and everything is written to outfp.  Only tags
    spelled exactly `<img' or `<IMG' are treated as image tags.  As a side
    effect the module-level `lower' flag is set from the case of the most
    recent image tag seen, which attrformat() consults.
    """
    global lower
    while 1:
        ch = infp.read(1)
        if ch == '':
            return
        outfp.write(ch)
        if ch != '<':
            continue
        # within an HTML tag
        lead = infp.read(2)
        outfp.write(lead)
        if not lead in ("im", "IM"):
            continue
        # splitting the read this way copes with single-char tags like <p>
        third = infp.read(1)
        # Echo what was actually read: at end of input this is empty, and
        # echoing the last character of `lead' instead would duplicate it.
        outfp.write(third)
        lead = lead + third
        if not lead in ("img", "IMG"):
            continue
        # within an image tag
        lower = (lead == 'img')
        attributes = ""
        while 1:
            ch = infp.read(1)
            if ch == '':
                # Input ended inside the tag: emit what was collected
                # unchanged rather than silently dropping it.
                outfp.write(attributes)
                return
            if ch == '>':
                break
            if ch == '/':
                # A slash may start the XHTML "/>" terminator; peek at the
                # next character and keep the pair together either way.
                ch2 = infp.read(1)
                ch = ch + ch2
                if ch2 == '>':
                    break
            attributes = attributes + ch
        # ch now holds the terminator (">" or "/>")
        outfp.write(transform(attributes) + ch)

# "<width> x <height>" as printed by file(1) and identify(1)
x_match = re.compile(r" ([0-9]+) *x *([0-9]+)")
# "<width>w * <height>h" as printed by rdjpgcom -verbose
rdjpg_match = re.compile(r" ([0-9]+)w *\* *([0-9]+)h")

def command_not_found(status):
    """Return true if a commands.getstatusoutput() status means the shell
    could not find the command (exit code 127).

    getstatusoutput() hands back a wait()-style status in which the exit
    code occupies the high byte, so a bare comparison against 127 never
    matches.  A plain 127 is accepted as well, for environments that
    return the exit code directly.
    """
    return status == 127 or (status >> 8) == 127

def _probe_size(filename):
    """Try each available tool in turn, cheapest first, on a local file.

    Returns a (width, height) pair, or None if no tool could determine
    the size.  Switches off the module-level latch of any tool found to
    be missing so it is not tried again.
    """
    global magick, rdjpgcom, pythonimage
    # mkarg() returns the name shell-quoted with a leading space, so names
    # containing blanks or shell metacharacters are passed through intact.
    quoted = commands.mkarg(filename)
    # Try file(1) first -- cheapest, as it doesn't read the whole image
    (status, output) = commands.getstatusoutput("file" + quoted)
    if status == 0:
        # file(1) works for every common image format other than JPEG
        if string.find(output, "JPEG") == -1:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif rdjpgcom:
            # Use rdjpgcom(1) to handle JPEGs
            (status, output) = commands.getstatusoutput(
                "rdjpgcom -verbose" + quoted)
            sizes = rdjpg_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
            elif command_not_found(status):
                rdjpgcom = 0
    # Next try identify(1), more expensive but bulletproof
    if magick:
        (status, output) = commands.getstatusoutput("identify" + quoted)
        if status == 0:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif command_not_found(status):
            sys.stderr.write("imgsizer: giving up on ImageMagick\n")
            magick = 0
    # if that fails, try at _LAST_ resort Python Imaging Library
    # open doesn't actually load all the data, so it shouldn't be too expensive
    if pythonimage:
        try:
            import Image
        except ImportError:
            # The library itself is missing: stop trying it.
            sys.stderr.write("imgsizer: giving up on Python Imaging Library\n")
            pythonimage = 0
        else:
            try:
                return Image.open(filename).size
            except IOError:
                # Only this image is unreadable; keep the library enabled
                # for the images that follow.
                pass
    return None

def imgsize(src):
    """Return the image size in pixels for a given image source.

    src is a URL or local file name; it is fetched with
    urllib.urlretrieve().  The result is a (width, height) pair -- decimal
    strings when parsed from command output, integers when obtained from
    the Python Imaging Library -- or None if the image cannot be fetched
    or analyzed.  A diagnostic goes to stderr when analysis fails.
    """
    try:
        (filename, headers) = urllib.urlretrieve(src)
    except IOError:
        return None
    # Now let's see if we can get a size for the retrieved image.
    try:
        dimensions = _probe_size(filename)
    finally:
        # Remove any temporary file urlretrieve() created for a remote
        # image; files that were local to begin with are left alone.
        urllib.urlcleanup()
    if not dimensions:
        # All attempts failed
        sys.stderr.write("imgsizer: couldn't analyze %s\n" % src)
        return None
    return dimensions

# SRC value, optionally double-quoted; group 1 is the bare URL
source = re.compile(r'SRC\s*=\s*"?([^" \t\n]*)"?', re.I)
# Existing absolute WIDTH/HEIGHT attributes, with any leading blanks
awidth = re.compile(r' *WIDTH\s*=\s*"?[0-9]*"?', re.I)
aheight = re.compile(r' *HEIGHT\s*=\s*"?[0-9]*"?', re.I)
# Percentage WIDTH/HEIGHT attributes, which must be left alone
pwidth = re.compile(r'WIDTH\s*=\s*"?[0-9]*%"?', re.I)
pheight = re.compile(r'HEIGHT\s*=\s*"?[0-9]*%"?', re.I)

def transform(attr):
    """Return the attribute text of an IMG tag with correct size attributes.

    attr is everything between the tag name and the closing ">" or "/>".
    It is returned unchanged when it has no usable SRC, when it carries a
    percentage width or height, when `no_overwrite' is set and both sizes
    are already present, or when the image size cannot be determined.
    Otherwise any existing absolute WIDTH/HEIGHT attributes are removed
    and fresh ones are appended.
    """
    src = source.search(attr)
    # Must have a source part and no percents in existing width or height
    if not src or pwidth.search(attr) or pheight.search(attr):
        return attr
    if no_overwrite and awidth.search(attr) and aheight.search(attr):
        return attr
    url = src.group(1)
    # An empty SRC (e.g. SRC="") gives us nothing to measure.
    if not url:
        return attr
    # Correct the url for documentation root, if present
    if url[0] == '/' and root:
        url = os.path.join(root, url[1:])
    # OK, get the size tuple if possible
    dimensions = imgsize(url)
    if not dimensions:
        return attr
    # Nuke any old size attr.  This is done even under no_overwrite: the
    # both-present case has already returned above, so at most one stale
    # attribute remains and keeping it would duplicate it in the output.
    attr = re.sub(awidth, "", attr)
    attr = re.sub(aheight, "", attr)
    # Compute image dimensions
    (xc, yc) = dimensions
    # Plug in the new attr
    return attr + attrformat(xc, "width") + attrformat(yc, "height")

# Output uppercase tags, surrounded by quotes, by default.
lower = 0
quotes = 1

# Existing sizes are overwritten by default.  transform() reads this flag;
# presumably the option-parsing code later in the file sets it for -n --
# TODO confirm.
no_overwrite = 0

# Set the default DocumentRoot to the current working directory.
root = "."

out = "imgsizer-out$$"
dir = "."

# NOTE: if you are doing