From 6b1b34d1da6b0db599c026e17df011ad6c6b3a30 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Fri, 1 Dec 2006 10:23:06 +0000 Subject: c.f. 23023: new calendar function system. --- Functions/Calendar/calendar_scandate | 519 +++++++++++++++++++++++++++++++++++ 1 file changed, 519 insertions(+) create mode 100644 Functions/Calendar/calendar_scandate (limited to 'Functions/Calendar/calendar_scandate') diff --git a/Functions/Calendar/calendar_scandate b/Functions/Calendar/calendar_scandate new file mode 100644 index 000000000..f0024b89a --- /dev/null +++ b/Functions/Calendar/calendar_scandate @@ -0,0 +1,519 @@ +# Scan a line for various common date and time formats. +# Set REPLY to the number of seconds since the epoch at which that +# time occurs. The time does not need to be matched; this will +# produce midnight at the start of the date. +# +# Absolute times +# +# The rules below are fairly complicated, to allow any natural (and +# some highly unnatural but nonetheless common) combination of +# time and date used by English speakers. It is recommended that, +# rather than exploring the intricacies of the system, users find +# a date format that is natural to them and stick to it. This +# will avoid unexpected effects. Various key facts should be noted, +# explained in more detail below: +# +# - In particular, note the confusion between month/day/year and +# day/month/year when the month is numeric; this format should be +# avoided if at all possible. Many alternatives are available. +# - However, there is currently no localization support, so month +# names must be English (though only the first three letters are required). +# The same applies to days of the week if they occur (they are not useful). +# - The year must be given in full to avoid confusion, and only years +# from 1900 to 2099 inclusive are matched. +# - Although timezones are parsed (complicated formats may not be recognized), +# they are then ignored; no time adjustment is made. 
+# +# The following give some obvious examples; users finding here +# a format they like and not subject to vagaries of style may skip +# the full description. As dates and times are matched separately +# (even though the time may be embedded in the date), any date format +# may be mixed with any format for the time of day provided the +# separators are clear (whitespace, colons, commas). +# 2007/04/03 13:13 +# 2007/04/03:13:13 +# 2007/04/03 1:13 pm +# 3rd April 2007, 13:13 +# April 3rd 2007 1:13 p.m. +# Apr 3, 2007 13:13 +# Tue Apr 03 13:13:00 2007 +# 13:13 2007/apr/3 +# +# Times are parsed and extracted before dates. They must use colons +# to separate hours and minutes, though a dot is allowed before seconds +# if they are present. This limits time formats to +# HH:MM[:SS[.FFFFF]] [am|pm|a.m.|p.m.] +# HH:MM.SS[.FFFFF] [am|pm|a.m.|p.m.] +# in which square brackets indicate optional elements, possibly with +# alternatives. Fractions of a second are recognised but ignored. +# Unless -r is given (see below), a date is mandatory but a time of day is +# not; the time returned is at the start of the date. +# +# Time zones are not handled, though if one is matched following a time +# specification it will be removed to allow a surrounding date to be +# parsed. This only happens if the format of the timezone is not too +# wacky: +# +0100 +# GMT +# GMT-7 +# CET+1CDT +# etc. are all understood, but any part of the timezone that is not numeric +# must have exactly three capital letters in the name. +# +# Dates suffer from the ambiguity between DD/MM/YYYY and MM/DD/YYYY. It is +# recommended this form is avoided with purely numeric dates, but use of +# ordinals, eg. 3rd/04/2007, will resolve the ambiguity as the ordinal is +# always parsed as the day of the month. Years must be four digits (and +# the first two must be 19 or 20); 03/04/08 is not recognised. Other +# numbers may have leading zeroes, but they are not required. 
The +# following are handled: +# YYYY/MM/DD +# YYYY-MM-DD +# YYYY/MNM/DD +# YYYY-MNM-DD +# DD[th|st|rd] MNM[,] YYYY +# DD[th|st|rd] MNM[,] current year assumed +# MNM DD[th|st|rd][,] YYYY +# MNM DD[th|st|rd][,] current year assumed +# DD[th|st|rd]/MM[,] YYYY +# DD[th|st|rd]/MM/YYYY +# MM/DD[th|st|rd][,] YYYY +# MM/DD[th|st|rd]/YYYY +# Here, MNM is at least the first three letters of a month name, +# matched case-insensitively. The remainder of the month name may appear but +# its contents are irrelevant, so janissary, febrile, martial, apricot, +# etc. are happily handled. +# +# Note there are only two cases that assume the current year, the +# form "Jun 20" or "14 September" (the only two commonly occurring +# forms, apart from a "the" in some forms of English, which isn't +# currently supported). Such dates will of course become ambiguous +# in the future, so should ideally be avoided. +# +# Times may follow dates with a colon, e.g. 1965/07/12:09:45; this +# is in order to provide a format with no whitespace. A comma +# and whitespace are allowed, e.g. "1965/07/12, 09:45". +# Currently the order of these separators is not checked, so +# illogical formats such as "1965/07/12, : ,09:45" will also +# be matched. Otherwise, a time is only recognised as being associated +# with a date if there is only whitespace in between, or if the time +# was embedded in the date. +# +# Days of the week are not scanned, but will be ignored if they occur +# at the start of the date pattern only. +# +# For example, the standard date format: +# Fri Aug 18 17:00:48 BST 2006 +# is handled by matching HH:MM:SS and removing it together with the +# matched (but unused) time zone. This leaves the following: +# Fri Aug 18 2006 +# "Fri" is ignored and the rest is matched according to the sixth of +# the standard rules. +# +# Relative times +# ============== +# +# The option -r allows a relative time. 
# Years (or ys, yrs, or without s),
# months (or mths, mons, mnths, months, or without s --- "m", "ms" and
# "mns" are ambiguous and are not handled), weeks (or ws, wks, or without
# s) and days (or ds, dys, days, or without s), hours (or hs, hrs, with or
# without s), minutes (or mins, with or without s) and seconds (or ss,
# secs, with or without s) are understood.  Spaces between the numbers
# are optional, but are required between items, although a comma
# may be used (with or without spaces).
#
# Note that a year here is 365.25 days and a month is 30 days.  TODO:
# improve this by passing down base time and adjusting.  (This will
# be crucial for events repeating monthly.)  TODO: it then makes
# sense to make PERIODly = 1 PERIOD (also for PERIOD = dai!)
#
# This allows forms like:
#   30 years 3 months 4 days 3:42:41
#   14 days 5 hours
#   4d,10hr
# In this case absolute dates are ignored.

emulate -L zsh
setopt extended_glob

# Bail out if the zsh/datetime module cannot be loaded.
if ! zmodload -i zsh/datetime; then
  return 1
fi

local tschars tspat_anchor tspat_noanchor schars daypat

# Separator characters before a time, or between a time and a date.
# "," "-" and ":" are accepted in front of the time so that spaceless
# but still fairly logical forms such as 2006/09/19:14:27 work.
# "/" is deliberately not accepted in front of a time: the example
# above is not 19 hours 14 mins and 27 seconds after anything.
tschars='[-,:[:space:]]'
# Pattern preceding the time when matching is anchored ...
tspat_anchor="(${tschars}#)"
# ... and when it is not anchored.
tspat_noanchor="(|*${tschars})"

# Separator characters between elements.  A comma is fairly natural
# punctuation; otherwise only whitespace (and ".") is allowed.
schars='[.,[:space:]]'
# Day of the week, with optional separators on either side.
daypat="${schars}#(sun|mon|tue|wed|thu|fri|sat)[a-z]#${schars}#"

# Start pattern for date: treat , as space for simplicity.  This
# is illogical at the start but saves lots of minor fiddling later.
# Date start pattern when anchored at the start.
# We need to be able to ignore the day here, although (for consistency
# with the unanchored case) we don't remove it until later.
# (The problem in the other case is that matching anything before
# the day of the week is greedy, so the day of the week gets ignored
# if it's optional.)
local dspat_anchor dspat_noanchor repat monthpat
# (#B) switches backreferences off around the day name so that it does
# not occupy a slot in $match; (#b) switches them back on.
dspat_anchor="(|(#B)${daypat}(#b)${schars}#)"
# Date start pattern when not anchored at the start.
dspat_noanchor="(|*${schars})"
# End pattern for relative times: similar remark about use of $schars.
repat="(|s)(|${schars}*)"
# Not locale-dependent!  I don't know how to get the months out
# of the system for the purpose of finding out where they occur.
# We may need some completely different heuristic.
monthpat="(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]#"
# Days of the week are not handled, but we need to ignore them.
# They are not localized either.

# Components of the time being assembled.
integer year month day
integer hour minute second
# Scratch strings; MATCH/MBEGIN/MEND and the arrays below are made local
# so that pattern matching with (#b)/(#m) does not leak into the caller.
local opt line orig_line mname tz
local MATCH MBEGIN MEND
local -a match mbegin mend
# Flags that we found a date or a time (maybe a relative time).
integer date_found time_found
# Indices of positions of start and end of time and dates found.
# These are actual character indices as zsh would normally use, i.e.
# line[time_start,time_end] is the string for the time.
integer time_start time_end date_start date_end
# Option flags, plus the accumulator for relative offsets in seconds.
integer anchor anchor_end debug relative reladd setvar

while getopts "aAdrs" opt; do
  case $opt in
    # -a: anchor the match at the start of the line
    (a) anchor=1 ;;
    # -A: anchor at the end, too
    (A) anchor=1 anchor_end=1 ;;
    # -d: enable debug output
    (d) debug=1 ;;
    # -r: scan for a relative time
    (r) relative=1 ;;
    # -s: request REPLY2 (the flag is checked where REPLY2 is set)
    (s) setvar=1 ;;
    # anything else is an error
    (*) return 1 ;;
  esac
done
shift $(( OPTIND - 1 ))

# Working copy of the argument, plus a pristine copy for index fix-ups.
line=$1
orig_line=$1

# Choose the lead-in patterns.  Unanchored is the default; the date
# pattern is anchored whenever -a/-A was given, but the time pattern is
# anchored only for relative times (for absolute times we test later
# whether the time is associated with the date).
local dspat tspat
dspat=$dspat_noanchor
tspat=$tspat_noanchor
if (( anchor )); then
  dspat=$dspat_anchor
  if (( relative )); then
    tspat=$tspat_anchor
  fi
fi

# Look for a time separately; we need colons for this.
# ---------------------------------------------------------------------
# Main scan.  Works on $line (a copy of the argument) in these stages:
#   1. find a time of day and cut it (plus any simple timezone) out;
#   2. unless -r was given, find a date in what is left;
#   3. check that the time really belongs to the date and work out the
#      text that remains once both have been removed;
#   4. with -r, accumulate relative units instead of a date;
#   5. set REPLY (and REPLY2 if -s was given) and return 0, or return 1
#      if nothing usable was found.
# In the time patterns $match[1] is always the lead-in from $tspat and
# the final group is the rest of the line.
# ---------------------------------------------------------------------

# a/p letter of an am/pm indicator; stays empty for 24-hour formats.
local ampm

case $line in
  # with seconds, am/pm: don't match / in front.
  ((#ibm)${~tspat}(<0-12>):(<0-59>)[.:]((<0-59>)(.<->|))[[:space:]]#([ap])(|.)[[:space:]]#m(.|[[:space:]]|(#e))(*))
  hour=$match[2]
  minute=$match[3]
  second=$match[5]
  ampm=$match[7]
  time_found=1
  ;;

  # no seconds, am/pm
  ((#ibm)${~tspat}(<0-12>):(<0-59>)[[:space:]]#([ap])(|.)[[:space:]]#m(.|[[:space:]]|(#e))(*))
  hour=$match[2]
  minute=$match[3]
  ampm=$match[4]
  time_found=1
  ;;

  # no colon, even, but a.m./p.m. indicator
  ((#ibm)${~tspat}(<0-12>)[[:space:]]#([ap])(|.)[[:space:]]#m(.|[[:space:]]|(#e))(*))
  hour=$match[2]
  minute=0
  ampm=$match[3]
  time_found=1
  ;;

  # 24 hour clock, with seconds
  ((#ibm)${~tspat}(<0-24>):(<0-59>)[.:]((<0-59>)(.<->|))(*))
  hour=$match[2]
  minute=$match[3]
  second=$match[5]
  time_found=1
  ;;

  # 24 hour clock, no seconds
  ((#ibm)${~tspat}(<0-24>):(<0-59>)(*))
  hour=$match[2]
  minute=$match[3]
  time_found=1
  ;;
esac

# Convert a 12-hour clock reading.  12 p.m. is noon and must stay 12;
# 12 a.m. is midnight and becomes 0.  (Adding 12 for every pm hour up to
# and including 12 would turn noon into 24, which the wrap below then
# maps to 0.)  ${(L)...} is used rather than a pattern so that
# $match/$mbegin/$mend from the case above are left untouched.
if [[ -n $ampm ]]; then
  if [[ ${(L)ampm} == p ]]; then
    if (( hour < 12 )); then
      (( hour += 12 ))
    fi
  elif (( hour == 12 )); then
    hour=0
  fi
fi

# 24:MM on the 24-hour clock is treated as 00:MM.
(( hour == 24 )) && hour=0

if (( time_found )); then
  # Time was found.  It runs from the hour to the end of the
  # last-but-one group (the last group is the rest of the line).
  time_start=$mbegin[2]
  time_end=$mend[-2]
  # Remove the timespec because it may be in the middle of
  # the date (as in the output of "date").
  # There may be a time zone, too, which we don't yet handle.
  # (It's not in POSIX strptime() and libraries don't support it well.)
  # This attempts to remove some of the weirder forms.
  if [[ $line[$time_end+1,-1] = (#b)[[:space:]]#([A-Z][A-Z][A-Z]|[-+][0-9][0-9][0-9][0-9])([[:space:]]|(#e))* || \
        $line[$time_end+1,-1] = (#b)[[:space:]]#([A-Z][A-Z][A-Z](|[-+])<0-12>)([[:space:]]|(#e))* || \
        $line[$time_end+1,-1] = (#b)[[:space:]]#([A-Z][A-Z][A-Z](|[-+])<0-12>[A-Z][A-Z][A-Z])([[:space:]]|(#e))* ]]; then
    # $mend is relative to the substring after the time, so adding the
    # end of the last group extends the time to cover the timezone.
    (( time_end += ${mend[-1]} ))
    tz=$match[1]
  fi
  line=$line[1,time_start-1]$line[time_end+1,-1]
  (( debug )) && print "line after time: $line"
fi

if (( relative == 0 )); then
  # Date.  In every pattern $match[1] is the lead-in from $dspat.
  # Ordinal suffixes th, st, nd and rd are accepted after a day number.
  case $line in
    # Look for YEAR[-/]MONTH[-/]DAY
    ((#bi)${~dspat}((19|20)[0-9][0-9])[-/](<1-12>)[-/](<1-31>)*)
    year=$match[2]
    month=$match[4]
    day=$match[5]
    date_start=$mbegin[2] date_end=$mend[5]
    date_found=1
    ;;

    # Same with month name
    ((#bi)${~dspat}((19|20)[0-9][0-9])[-/]${~monthpat}[-/](<1-31>)*)
    year=$match[2]
    mname=$match[4]
    day=$match[5]
    date_start=$mbegin[2] date_end=$mend[5]
    date_found=1
    ;;

    # Look for DAY[th/st/nd/rd] MNAME[,] YEAR
    ((#bi)${~dspat}(<1-31>)(|th|st|nd|rd)[[:space:]]##${~monthpat}(|,)[[:space:]]##((19|20)[0-9][0-9])*)
    day=$match[2]
    mname=$match[4]
    year=$match[6]
    date_start=$mbegin[2] date_end=$mend[6]
    date_found=1
    ;;

    # Look for MNAME DAY[th/st/nd/rd][,] YEAR
    ((#bi)${~dspat}${~monthpat}[[:space:]]##(<1-31>)(|th|st|nd|rd)(|,)[[:space:]]##((19|20)[0-9][0-9])*)
    mname=$match[2]
    day=$match[3]
    year=$match[6]
    date_start=$mbegin[2] date_end=$mend[6]
    date_found=1
    ;;

    # Look for DAY[th/st/nd/rd] MNAME; assume current year
    ((#bi)${~dspat}(<1-31>)(|th|st|nd|rd)[[:space:]]##${~monthpat}(|,)([[:space:]]##*|))
    day=$match[2]
    mname=$match[4]
    strftime -s year "%Y" $EPOCHSECONDS
    date_start=$mbegin[2] date_end=$mend[5]
    date_found=1
    ;;

    # Look for MNAME DAY[th/st/nd/rd]; assume current year
    ((#bi)${~dspat}${~monthpat}[[:space:]]##(<1-31>)(|th|st|nd|rd)(|,)([[:space:]]##*|))
    mname=$match[2]
    day=$match[3]
    strftime -s year "%Y" $EPOCHSECONDS
    date_start=$mbegin[2] date_end=$mend[5]
    date_found=1
    ;;

    # Now it gets a bit ambiguous.
    # Look for DAY[th/st/nd/rd][/]MONTH[/ ,]YEAR
    ((#bi)${~dspat}(<1-31>)(|th|st|nd|rd)/(<1-12>)((|,)[[:space:]]##|/)((19|20)[0-9][0-9])*)
    day=$match[2]
    month=$match[4]
    year=$match[7]
    date_start=$mbegin[2] date_end=$mend[7]
    date_found=1
    ;;

    # Look for MONTH[/]DAY[th/st/nd/rd][/ ,]YEAR
    ((#bi)${~dspat}(<1-12>)/(<1-31>)(|th|st|nd|rd)((|,)[[:space:]]##|/)((19|20)[0-9][0-9])*)
    month=$match[2]
    day=$match[3]
    year=$match[7]
    date_start=$mbegin[2] date_end=$mend[7]
    date_found=1
    ;;
  esac
fi

if (( date_found )); then
  # Date found.
  # See if there's a day of the week at the start; if so, it is
  # removed together with the date.
  if [[ ${line[1,$date_start-1]} = (#bi)${~daypat} ]]; then
    date_start=$mbegin[1]
  fi
  line=${line[1,$date_start-1]}${line[$date_end+1,-1]}
  if (( time_found )); then
    # If we found a time, it must be associated with the date,
    # or we can't use it.  Since we removed the time from the
    # string to find the date, however, it's complicated to
    # know where both were found.  Reconstruct the date indices of
    # the original string.
    if (( time_start <= date_start )); then
      # Time came before start of date; add length in.
      (( date_start += time_end - time_start + 1 ))
    fi
    if (( time_start <= date_end )); then
      (( date_end += time_end - time_start + 1 ))
    fi

    if (( time_end + 1 < date_start )); then
      # If time wholly before date, OK if only separator characters
      # in between.  (This allows some illogical stuff with commas
      # but that's probably not important.)
      if [[ ${orig_line[time_end+1,date_start-1]} != ${~schars}# ]]; then
        # Clearly this can't work if anchor is set.  In principle,
        # we could match the date and ignore the time if it wasn't.
        # However, that seems dodgy.
        return 1
      else
        # Form massaged line by removing the entire date/time chunk.
        line="${orig_line[1,time_start-1]}${orig_line[date_end+1,-1]}"
      fi
    elif (( date_end + 1 < time_start )); then
      # If date wholly before time, OK if only time separator characters
      # in between.  This allows 2006/10/12:13:43 etc.
      if [[ ${orig_line[date_end+1,time_start-1]} != ${~tschars}# ]]; then
        # Here, we assume the time is associated with something later
        # in the line.  This is pretty much inevitable for the sort
        # of use we are expecting.  For example,
        #   2006/10/24 Meeting from early, may go on till 12:00.
        # or with some uses of the calendar system,
        #   2006/10/24 MR 1 Another pointless meeting WARN 01:00
        # The 01:00 says warn an hour before, not that the meeting starts
        # at 1 am.  About the only safe way round would be to force
        # a time to be present, but that's not how the traditional
        # calendar programme works.
        #
        # Hence we need to reconstruct.
        (( time_found = 0, hour = 0, minute = 0, second = 0 ))
        line="${orig_line[1,date_start-1]}${orig_line[date_end+1,-1]}"
      else
        # As above: remove the entire date/time chunk.
        line="${orig_line[1,date_start-1]}${orig_line[time_end+1,-1]}"
      fi
    fi
    if (( debug )); then
      print "Time string: $time_start,$time_end:" \
        "'$orig_line[time_start,time_end]'"
      print "Date string: $date_start,$date_end:" \
        "'$orig_line[date_start,date_end]'"
      print "Remaining line: '$line'"
    fi
  fi
fi

if (( relative )); then
  # Relative time: pick out "<number><unit>" items one unit at a time,
  # in the order years, months, weeks, days, hours, minutes, seconds.
  # Each unit is looked for once; a match adds to reladd and is cut out
  # of $line.  In the pattern $match[2] is the number, $match[3] the
  # unit name and $match[4] the optional plural "s".
  local -a relunits relsecs
  integer ri
  relunits=(
    '(y|yr|year)'
    '(mth|mon|mnth|month)'
    '(w|wk|week)'
    '(d|dy|day)'
    '(h|hr|hour)'
    '(min|minute)'
    '(s|sec|second)'
  )
  # Seconds per unit: a year is 365.25 days, a month is 30 days.
  relsecs=(31557600 2592000 604800 86400 3600 60 1)
  for (( ri = 1; ri <= $#relunits; ri++ )); do
    if [[ $line = (#bi)${~dspat}(<->)[[:blank:]]#${~relunits[ri]}${~repat} ]]; then
      (( reladd += relsecs[ri] * ${match[2]} ))
      line=${line[1,$mbegin[2]-1]}${line[$mend[4]+1,-1]}
      time_found=1
    fi
  done

  # No date is needed here, but we're in trouble unless we found
  # a time (either a time of day or at least one relative unit).
  if (( ! time_found )); then
    return 1
  fi
  if (( anchor_end )); then
    # must be left with only separator characters
    if [[ $line != ${~schars}# ]]; then
      return 1
    fi
  fi
  (( REPLY = reladd + (hour * 60 + minute) * 60 + second ))
  # setvar is an integer flag, so it must be tested arithmetically:
  # as a string it is never empty.
  (( setvar )) && REPLY2=$line
  return 0
fi

# Absolute time: a date is mandatory.
if (( ! date_found )); then
  return 1
fi

if (( anchor_end )); then
  # must be left with only separator characters
  if [[ $line != ${~schars}# ]]; then
    return 1
  fi
fi

# Convert the components to seconds since the epoch.  Use %b when the
# month was given by name and %m when it was numeric.
local fmt nums
if [[ -n $mname ]]; then
  fmt="%Y %b %d %H %M %S"
  nums="$year $mname $day $hour $minute $second"
else
  fmt="%Y %m %d %H %M %S"
  nums="$year $month $day $hour $minute $second"
fi

# Don't report success with a stale REPLY if the conversion fails.
strftime -s REPLY -r $fmt $nums || return 1

(( setvar )) && REPLY2=$line

return 0
-- cgit 1.4.1