diff lisp/packages/man.el @ 10:49a24b4fd526 r19-15b6

Import from CVS: tag r19-15b6
author cvs
date Mon, 13 Aug 2007 08:47:52 +0200
parents 4b173ad71786
children 0293115a14e9
line wrap: on
line diff
--- a/lisp/packages/man.el	Mon Aug 13 08:47:36 2007 +0200
+++ b/lisp/packages/man.el	Mon Aug 13 08:47:52 2007 +0200
@@ -170,6 +170,72 @@
 imposes a large startup cost which is why it is not simply on by
 default on all systems.")
 
+(defvar Manual-use-rosetta-man (not (null (locate-file "rman" exec-path))) "\
+If non-nil, use RosettaMan (rman) to filter man pages.
+This makes man-page cleanup virtually instantaneous, instead of
+potentially taking a long time.
+
+Here is information on RosettaMan, from Neal.Becker@comsat.com (Neal Becker):
+
+RosettaMan is a filter for UNIX manual pages.  It takes as input man
+pages formatted for a variety of UNIX flavors (not [tn]roff source)
+and produces as output a variety of file formats.  Currently
+RosettaMan accepts man pages as formatted by the following flavors of
+UNIX: Hewlett-Packard HP-UX, AT&T System V, SunOS, Sun Solaris, OSF/1,
+DEC Ultrix, SGI IRIX, Linux, SCO; and produces output for the following
+formats: printable ASCII only (stripping page headers and footers),
+section and subsection headers only, TkMan, [tn]roff, Ensemble, RTF,
+SGML (soon--I finally found a DTD), HTML, MIME, LaTeX, LaTeX 2e, Perl 5's pod.
+
+RosettaMan improves on other man page filters in several ways: (1) its
+analysis recognizes the structural pieces of man pages, enabling high
+quality output, (2) its modular structure permits easy augmentation of
+output formats, (3) it accepts man pages formatted with the variant
+macros of many different flavors of UNIX, and (4) it doesn't require
+modification or cooperation with any other program.
+
+RosettaMan is a rewrite of TkMan's man page filter, called bs2tk.  (If
+you haven't heard about TkMan, a hypertext man page browser, you
+should grab it via anonymous ftp from ftp.cs.berkeley.edu:
+/ucb/people/phelps/tkman.tar.Z.)  Whereas bs2tk generated output only for
+TkMan, RosettaMan generalizes the process so that the analysis can be
+leveraged to new output formats.  A single analysis engine recognizes
+section heads, subsection heads, body text, lists, references to other
+man pages, boldface, italics, bold italics, special characters (like
+bullets), tables (to a degree) and strips out page headers and
+footers.  The engine sends signals to the selected output functions so
+that an enhancement in the engine improves the quality of output of
+all of them.  Output format functions are easy to add, and thus far
+average about 75 lines of C code each.
+
+
+
+*** NOTES ON CURRENT VERSION ***
+
+Help!  I'm looking for people to help with the following projects.
+\(1) Better RTF output format.  The current one works, but could be
+made better.  (2) Roff macros that produce text that is easily
+parsable.  RosettaMan handles a great variety, but some things, like
+H-P's tables, are intractable.  If you write an output format or
+otherwise improve RosettaMan, please send in your code so that I may
+share the wealth in future releases.
+
+This version can try to identify tables (turn this on with the -T
+switch) by looking for lines with a large amount of interword spacing,
+reasoning that this is space between columns of a table.  This
+heuristic doesn't always work and sometimes misidentifies ordinary
+text as tables.  In general I think it is impossible to perfectly
+identify tables from nroff formatted text.  However, I do think the
+heuristics can be tuned, so if you have a collection of manual pages
+with unrecognized tables, send me the lot, in formatted form (i.e.,
+after formatting with nroff -man), and uuencode them to preserve the
+control characters.  Better, if you can think of heuristics that
+distinguish tables from ordinary text, I'd like to hear them.
+
+
+Notes for HTML consumers: This filter does real (heuristic)
+parsing--no <PRE>!  Man page references are turned into hypertext links.")
+
 (make-face 'man-italic)
 (or (face-differs-from-default-p 'man-italic)
     (copy-face 'italic 'man-italic))
@@ -780,45 +846,48 @@
 ;; Hint: BS stands form more things than "back space"
 (defun Manual-nuke-nroff-bs (&optional apropos-mode)
   (interactive "*")
-  ;;
-  ;; turn underlining into italics
-  ;;
-  (goto-char (point-min))
-  (while (search-forward "_\b" nil t)
-    ;; searching for underscore-backspace and then comparing the following
-    ;; chars until the sequence ends turns out to be much faster than searching
-    ;; for a regexp which matches the whole sequence.
-    (let ((s (match-beginning 0)))
-      (goto-char s)
-      (while (and (= (following-char) ?_)
-		  (= (char-after (1+ (point))) ?\b))
-	(Manual-delete-char 2)
-	(forward-char 1))
-      (set-extent-face (make-extent s (point)) 'man-italic)))
-  ;;
-  ;; turn overstriking into bold
-  ;;
-  (goto-char (point-min))
-  (while (re-search-forward "\\([^\n]\\)\\(\b\\1\\)" nil t)
-    ;; Surprisingly, searching for the above regexp is faster than searching
-    ;; for a backspace and then comparing the preceding and following chars,
-    ;; I presume because there are many false matches, meaning more funcalls
-    ;; to re-search-forward.
-    (let ((s (match-beginning 0)))
-      (goto-char s)
-      ;; Some systems (SGI) overstrike multiple times, eg, "M\bM\bM\bM".
-      (while (looking-at "\\([^\n]\\)\\(\b\\1\\)+")
-	(delete-region (+ (point) 1) (match-end 0))
-	(forward-char 1))
-      (set-extent-face (make-extent s (point)) 'man-bold)))
-  ;;
-  ;; hack bullets: o^H+ --> +
-  (goto-char (point-min))
-  (while (search-forward "\b" nil t)
-    (Manual-delete-char -2))
+  (if Manual-use-rosetta-man
+      (call-process-region (point-min) (point-max) "rman" t t nil)
+    ;;
+    ;; turn underlining into italics
+    ;;
+    (goto-char (point-min))
+    (while (search-forward "_\b" nil t)
+      ;; searching for underscore-backspace and then comparing the following
+      ;; chars until the sequence ends turns out to be much faster than searching
+      ;; for a regexp which matches the whole sequence.
+      (let ((s (match-beginning 0)))
+	(goto-char s)
+	(while (and (= (following-char) ?_)
+		    (= (char-after (1+ (point))) ?\b))
+	  (Manual-delete-char 2)
+	  (forward-char 1))
+	(set-extent-face (make-extent s (point)) 'man-italic)))
+    ;;
+    ;; turn overstriking into bold
+    ;;
+    (goto-char (point-min))
+    (while (re-search-forward "\\([^\n]\\)\\(\b\\1\\)" nil t)
+      ;; Surprisingly, searching for the above regexp is faster than searching
+      ;; for a backspace and then comparing the preceding and following chars,
+      ;; I presume because there are many false matches, meaning more funcalls
+      ;; to re-search-forward.
+      (let ((s (match-beginning 0)))
+	(goto-char s)
+	;; Some systems (SGI) overstrike multiple times, eg, "M\bM\bM\bM".
+	(while (looking-at "\\([^\n]\\)\\(\b\\1\\)+")
+	  (delete-region (+ (point) 1) (match-end 0))
+	  (forward-char 1))
+	(set-extent-face (make-extent s (point)) 'man-bold)))
+    ;;
+    ;; hack bullets: o^H+ --> +
+    (goto-char (point-min))
+    (while (search-forward "\b" nil t)
+      (Manual-delete-char -2))
 
-  (if (> (buffer-size) 100) ; minor kludge
-      (Manual-nuke-nroff-bs-footers))
+    (if (> (buffer-size) 100) ; minor kludge
+	(Manual-nuke-nroff-bs-footers))
+    ) ;; not Manual-use-rosetta-man
   ;;
   ;; turn subsection header lines into bold
   ;;
@@ -850,12 +919,14 @@
       (forward-line 1))
     )
 
-  ;; Zap ESC7,  ESC8, and ESC9
-  ;; This is for Sun man pages like "man 1 csh"
-  (goto-char (point-min))
-  (while (re-search-forward "\e[789]" nil t)
-    (replace-match ""))
-  
+  (if Manual-use-rosetta-man
+      nil
+    ;; Zap ESC7,  ESC8, and ESC9
+    ;; This is for Sun man pages like "man 1 csh"
+    (goto-char (point-min))
+    (while (re-search-forward "\e[789]" nil t)
+      (replace-match "")))
+
   ;; Nuke blanks lines at start.
   ;;  (goto-char (point-min))
   ;;  (skip-chars-forward "\n")