[Fencommits] fenfire-hs: irc2rdf fixes: unicode, turtle literals, dateTime vs. date

Tuukka Hastrup Tuukka.Hastrup at iki.fi
Mon Mar 12 05:23:09 EET 2007


Mon Mar 12 05:21:42 EET 2007  Tuukka Hastrup <Tuukka.Hastrup at iki.fi>
  * irc2rdf fixes: unicode, turtle literals, dateTime vs. date
diff -rN -u old-fenfire-hs/Irc2RDF.hs new-fenfire-hs/Irc2RDF.hs
--- old-fenfire-hs/Irc2RDF.hs	2007-03-12 05:23:09.000000000 +0200
+++ new-fenfire-hs/Irc2RDF.hs	2007-03-12 05:23:09.000000000 +0200
@@ -1,4 +1,4 @@
-
+{-# OPTIONS_GHC -fffi #-}
 module Irc2RDF where
 
 -- Copyright (c) 2006-2007, Benja Fallenstein, Tuukka Hastrup
@@ -25,12 +25,53 @@
 import System.IO.Unsafe (unsafeInterleaveIO)
 
 import Data.Char (toUpper, toLower)
+import Data.Char (ord)
+import Data.Bits ((.&.))
+
+import qualified Control.Exception
+
+import System.Glib.UTFString (newUTFString, readCString, 
+                              peekUTFString)
+import System.Glib.FFI (withCString, nullPtr, CString, CInt, Ptr)
+import System.IO.Unsafe (unsafePerformIO)
+
+foreign import ccall "g_utf8_validate" valid :: CString -> CInt -> 
+                                                Ptr (CString) -> Bool
+
+-- XXX real toUTF isn't exported from System.Glib.UTFString
+toUTF :: String -> String
+toUTF s = unsafePerformIO $ newUTFString s >>= readCString
+
+fromUTF :: String -> String
+fromUTF s = unsafePerformIO $ Control.Exception.catch 
+    (withCString s $ \cstr -> peekUTFString cstr >>= \s' -> 
+     if (valid cstr (-1) nullPtr) then return s' -- force any exceptions
+                           else return s )
+    (\_e -> return s)                 -- if any, keep the local encoding
+
+-- from gutf8.c used in g_utf8_validate
+isUnicode c' = let c = ord c' in
+     c < 0x110000 &&
+     c .&. 0xFFFFF800 /= 0xD800 &&
+     (c < 0xFDD0 || c > 0xFDEF) &&
+     c .&. 0xFFFE /= 0xFFFE
+
+-- XXX which unicode characters must be escaped?
+turtle_escaped :: Char -> String -> String
+turtle_escaped _        [] = []
+turtle_escaped c ('\\':xs) = '\\':'\\':turtle_escaped c xs
+turtle_escaped c    (x:xs) | c == x 
+                           = '\\':   c:turtle_escaped c xs
+turtle_escaped c ('\n':xs) = '\\': 'n':turtle_escaped c xs
+turtle_escaped c ('\r':xs) = '\\': 'r':turtle_escaped c xs
+turtle_escaped c ('\t':xs) = '\\': 't':turtle_escaped c xs
+turtle_escaped c (   x:xs) =         x:turtle_escaped c xs
 
 main = do [root,filepath] <- getArgs
           'h':'t':'t':'p':':':'/':'/':_ <- return root
           irc <- getContents
           timestamps <- getTimeStamps
-          mapM_ (uncurry $ handle root filepath) $ zip (lines irc) 
+          mapM_ (uncurry $ handle root filepath) $ zip (map fromUTF$lines irc)
                                                        (uniquify timestamps)
 
 getTimeStamps = do ~(TOD secs _picos) <- unsafeInterleaveIO getClockTime
@@ -49,7 +90,7 @@
 handle :: String -> FilePath -> String -> (ClockTime, Maybe Integer) -> IO ()
 handle root filepath line (clockTime,offset) = do 
     let (file,output) = irc2rdf root filepath (clockTime,offset) line
-    maybe (return ()) ((flip appendFile) output) file
+    maybe (return ()) ((flip appendFile) (toUTF output)) file
 
 irc2rdf :: String -> FilePath -> (ClockTime, Maybe Integer) -> String ->
            (Maybe FilePath,String)
@@ -69,7 +110,8 @@
            Maybe String -> [String] -> (Maybe FilePath, String)
 triples root filepath (time,offset) (Just prefix) [cmd,target,msg] 
     | map toUpper cmd == "PRIVMSG", 
-      '#':channel <- map toLower target, channel `elem` ["fenfire","swig"]
+      '#':channel <- map toLower target, channel `elem` ["fenfire","sioc",
+                                                         "swig","haskell"]
     = 
     let file = channel ++ "-" ++ day
         uri = root ++ file ++ "#" ++ second ++ maybe "" (('.':) . show) offset
@@ -80,21 +122,22 @@
     "<irc://freenode/%23"++channel++"> <"++isContainerOf++"> <"++uri++">.\n"++
     "<irc://freenode/%23"++channel++"> <"++rdftype++"> <"++forum++">.\n"++
     "<"++uri++"> <"++created++"> "++
-        show (day++"T"++second++"Z")++"^^<"++date++">.\n"++
+        t (day++"T"++second++"Z")++"^^<"++date++">.\n"++
     "<"++uri++"> <"++hasCreator++"> <"++creator++">.\n"++
-    "<"++uri++"> <"++hasContent++"> "++show msg++".\n"++
-    "<"++uri++"> <"++label++"> "++show ("<"++nick++"> "++msg)++".\n"++
+    "<"++uri++"> <"++hasContent++"> "++t msg++".\n"++
+    "<"++uri++"> <"++label++"> "++t ("<"++nick++"> "++msg)++".\n"++
     "<"++uri++"> <"++rdftype++"> <"++post++">.\n"++
-    "<"++creator++"> <"++label++"> "++show nick++".\n"++
+    "<"++creator++"> <"++label++"> "++t nick++".\n"++
     "<"++creator++"> <"++rdftype++"> <"++user++">.\n"
     )
-    where label = "http://www.w3.org/2000/01/rdf-schema#label"
+    where t str = "\"" ++ turtle_escaped '\"' str ++ "\""
+          label = "http://www.w3.org/2000/01/rdf-schema#label"
           rdftype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
           created = "http://purl.org/dc/terms/created"
           isContainerOf = "http://rdfs.org/sioc/ns#is_container_of"
           hasCreator = "http://rdfs.org/sioc/ns#has_creator"
           hasContent = "http://rdfs.org/sioc/ns#has_content"
-          date = "http://www.w3.org/2001/XMLSchema#date"
+          date = "http://www.w3.org/2001/XMLSchema#dateTime"
           forum = "http://rdfs.org/sioc/ns#Forum"
           post = "http://rdfs.org/sioc/ns#Post"
           user = "http://rdfs.org/sioc/ns#User"




More information about the Fencommits mailing list