Article 7717 of comp.lang.perl:
Xref: feenix.metronet.com comp.lang.perl:7717
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!pipex!uunet!munnari.oz.au!metro!usage!news
From: cameron@cse.unsw.edu.au (Cameron Simpson)
Subject: Re: Stumped at To: header parsing
Message-ID: <cameron-931108235339-1-06469@fuligin>
To: shutton@copper.ucs.indiana.edu (Scott K. Hutton)
Followup-To: comp.lang.perl
Sender: news@usage.csd.unsw.OZ.AU
Nntp-Posting-Host: fuligin.spectrum.cs.unsw.oz.au
Reply-To: cameron@cse.unsw.edu.au
Organization: CS&E Computing Facility, Uni Of NSW, Oz
References: <CFw55t.BEp@usenet.ucs.indiana.edu>
Errors-To: cameron@cse.unsw.edu.au
Date: Mon, 8 Nov 1993 12:53:54 GMT
Return-Receipt-To: cameron@cse.unsw.edu.au
Lines: 330

| I'm at a loss for an efficient way to parse a nasty header in the form
| of:
| 
|   To: foobar@baz.com "Frank Oobar, Director", quux@biff.bitnet (BIFF)
| 
| The line needs to be split into its component addresses, but it should
| never be split on a comma that occurs within quotes or parens.  I
| can't figure out a pattern that will work for this and might be forced
| to scan the line for commas and figuring out if we're in quotes/parens
| or not.
| 
| Surely someone out there has already invented this wheel...
| 
| Be happy to share a smattering of other mail parsing code in return
| (or, for that matter, even if you can't help).  I'm working up a
| mail parsing package, since that's what I seem to do the most.
| 
|  -Scott

I append my rfc822.pl package. Let me know if it uses things I hacen't
included, or of any bugs you find. What you want are &parseaddrs and
&rawaddrs.
	- Cameron Simpson
	  cameron@cse.unsw.edu.au, DoD#743
--
Hacker: One who accidentally destroys.
Wizard: One who recovers afterwards.

#!/bin/sh
#

sed 's/^X//' > rfc822.pl <<'EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl'
X#!/usr/local/bin/perl
X#
X# Code to support RFC822-style message headers.
X#
X# &clrhdrs
X#	Empty %'hdrs and @'hdrs.
X#
X# &hdrkey  Field name to key for array.
X# &hdrnorm Field name to output form (capitalise words).
X#
X# &hdr(name) -> @values or undef
X#	Returns text of header as list (if accomodating multiple entries)
X#	or the list joined with "\t\n" or ", " if in scalar context.
X#
X# &addhdrs(@message_lines)
X#	Extract leading header lines from @lines up to blank line if present
X#	and add to @'hdrs. Then rebuild %'hdrs to be the content of each line
X#	keyed by downcased field name (and _ -> -). Multiple bodies are joined
X#	by ", " if in $rfc822'listfieldptn or by "\t\n" otherwise. Returns the
X#	unprocessed lines.
X#
X# &delhdrs(@field_names)
X#	Remove all references to the specified headers from @'hdrs and %'hdrs.
X#
X# &synchdrs
X#	Rebuild %'hdrs from @'hdrs.
X#
X# &parseaddrs($addresslist) -> @(precomment, address, postcomment)
X#	Break comma separated address list into a list of tuples,
X#	being leading comment, address portion, trailing comment.
X#
X# &rawaddrs(@(pre,addr,post)) -> @addrs
X#	Extract the middle elements from a 3-tuple list.
X#
X# &msgid
X#	Generate a message-id for an article.
X#
X
Xpackage rfc822;
X
X@mailhdrs=('to','cc','bcc','from','sender','reply-to','return-receipt-to',
X	   'errors-to');
X@newshdrs=('newsgroups','followup-to');
X$mailptn=join('|',@mailhdrs);
X$newsptn=join('|',@newshdrs);
X
X@listhdrs=(@mailhdrs,@newshdrs,'keywords');
X$listfieldptn=join('|',@listhdrs);
X
X&clrhdrs;
X
Xsub clrhdrs
X	{ undef %'hdrs, @'hdrs;
X	  $synced=1;
X	}
X
Xsub hdr
X	{ local($_)=@_;
X
X	  $_=&hdrkey($_);
X
X	  &synchdrs;
X
X	  return undef unless defined($'hdrs{$_});
X
X	  return $'hdrs{$_} unless wantarray;
X
X	  local(@bodies)=();
X
X	  for $hdr (@'hdrs)
X		{ push(@bodies,$') if $hdr =~ /^$_:\s*/;
X		}
X
X	  @bodies;
X	}
X
X# add a line to @hdrs and %hdrs
X# just adds to @hdrs until it gets "" or "\n" and then sets up %hdrs
X# field names matching $commaptn are concatenated with commas,
X# otherwise with "\n\t".
Xsub addhdrs	# @lines -> @remaining_lines
X	{ local($commaptn)=$listfieldptn;
X	  local($_,$hdr);
X
X	  $hdr='';
X	  while (defined($_=shift))
X		{ s/\r?\n$//;
X		  last if !length;
X
X		  if (/^\s/)
X			{ $hdr.="\n$_";
X			}
X		  else
X		  { length($hdr) && push(@'hdrs,$hdr);
X		    $hdr=$_;
X		  }
X		}
X
X	  length($hdr) && push(@'hdrs,$hdr);
X
X	  $synced=0;
X
X	  @_;
X	}
X
X# delete all instances of specified headers
Xsub delhdrs	# (@fieldnames) -> void
X	{ local(@fields)=@_;
X	  local($field,@newhdrs);
X
X	  for $field (@fields)
X		{ $field=&hdrkey($field);
X		  @hdrs=eval 'grep(!/^$field:/o,@hdrs)';
X		}
X
X	  $synced=0;
X	}
X
X# replace header lines
Xsub rephdrs	# (@headerlines) -> void
X	{ local(@reps)=@_;
X	  local($_,$field);
X	
X	  for (@reps)
X		{ next unless /^([-\w]+):/;
X		  $field=&hdrkey($1);
X
X		  for $hdr (@'hdrs)
X			{ $hdr =~ s/^$field:/X-Original-$&/i;
X			}
X		}
X
X	  $synced=0;
X
X	  &addhdrs(@reps);
X	}
X
X# Get key from field.
Xsub hdrkey
X	{ local($_)=@_;
X	  tr/_A-Z/-a-z/;
X	  $_;
X	}
X
X# Get normal form of field name.
Xsub hdrnorm
X	{ local($_)=&hdrkey($_[0]);
X
X	  print STDERR "norm($_) -> ";
X	  s/\b[a-z]/\u$&/g;
X	  print STDERR "$_\n";
X	  $_;
X	}
X
X# Rebuild %'hdrs from @'hdrs.
Xsub synchdrs	# (void) -> (void)
X	{ return if $synced;
X
X	  local($key,$field,$_);
X
X	  undef %'hdrs;
X	  for (@'hdrs)
X		{ if (/^([^\s:]+):\s*/)
X			{ $key=$1;
X			  $field=$'; $field =~ s/^\s+//;
X			  $key=&hdrkey($key);
X			  if (defined($'hdrs{$key}))
X				{ if ($key =~ /^$commaptn$/o)
X					{ $'hdrs{$key}.=', ';
X					}
X				  else
X				  { $'hdrs{$key}.="\n\t";
X				  }
X
X				  $'hdrs{$key}.=$field;
X				}
X			  else
X			  { $'hdrs{$key}=$field;
X			  }
X			}
X		}
X
X	  $synced=1;
X	}
X
X# parse an RFC822 address list returning a list of tuples
X#	(leading comment, address, trailing comment, ...)
Xsub parseaddrs	# (addrlist) -> @(precomment, addr, postcomment)
X	{ local($_)=@_;
X	  local(@parsed);
X	  local($pre,$addr,$post)=('','','');
X
X	  s/^\s+//;
X	  while (length)
X		{ if (/^,/)
X			# end of currently building address
X			{ $_=$';
X			  if (length($pre) && !length($addr))
X				{ $addr=$pre; $pre='';
X				}
X
X			  if (length($pre) || length($addr) || length($post))
X				{ push(@parsed,$pre,$addr,$post);
X				}
X
X			  $pre='';
X			  $addr='';
X			  $post='';
X			}
X		  elsif (!length($addr) && /^[-\w_.]+(@[-\w_.]+)?/)
X			{ $_=$';
X			  $addr=$&; 
X			}
X		  elsif (/^"([^"]|\\")*"/ || /^'([^']|\\')*'/)
X			{ $_=$';
X			  if (length($addr))
X				{ $post .= " $&";
X				}
X			  else
X			  { $pre .= " $&";
X			  }
X			}
X		  elsif (/^<[^>\s]*>/)
X			{ $_=$';
X			  if (length($addr))
X				{ $pre.=' '.$addr;
X				  if (length($post))
X					{ $pre.=' '.$post;
X					  $post='';
X					}
X				}
X
X			  $addr=$&;
X			}
X		  elsif (/^[^,\s]+/)
X			{ $_=$';
X			  if (length($addr))
X				{ $post.=" $&";
X				}
X			  else
X			  { $pre.=" $&";
X			  }
X			}
X		  else
X		  { print STDERR "trouble parsing, remaining address is \"$_\"\n";
X		  }
X
X		  s/^\s+//;
X		}
X
X	  if (length($pre) && !length($addr))
X		{ $addr=$pre; $pre='';
X		}
X
X	  if (length($pre) || length($addr) || length($post))
X		{ push(@parsed,$pre,$addr,$post);
X		}
X
X	  for (@parsed)
X		{ s/^\s+//;
X		}
X
X	  @parsed;
X	}
X
X# strip out the raw addresses from the result of &parseaddrs
Xsub rawaddrs
X	{ local(@rawaddrs);
X	  local($_);
X
X	  while (defined($_=shift))
X		{ $_=shift;
X		  last if !defined;
X		  s/^<([^>\s]+)>$/$1/;
X		  push(@rawaddrs,$_);
X		  shift;
X		}
X
X	  @rawaddrs;
X	}
X
X$msgid_count=0;
Xsub msgid
X	{ local($sec,$min,$hour,$mday,$mon,$year,@etc)=localtime(time);
X
X	  $msgid_count++;
X	  sprintf("<%s-%02d%02d%02d%02d%02d%02d-%d-%05d@%s>",
X		$'USER,
X		$year,$mon+1,$mday,$hour,$min,$sec,
X		$msgid_count,
X		$$,
X		$'HOSTNAME);
X	}
X
X1;	# for require
EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl

exit 0


