N-граммы в действии
This commit is contained in:
@@ -8,4 +8,7 @@ main =
|
|||||||
putStrLn "Введите имя файла:" >>
|
putStrLn "Введите имя файла:" >>
|
||||||
getLine >>= \fileName ->
|
getLine >>= \fileName ->
|
||||||
readFile fileName >>= \content ->
|
readFile fileName >>= \content ->
|
||||||
uprint $ splitText content
|
let sentences = splitText content in
|
||||||
|
uprint (take 10 sentences) >>
|
||||||
|
let dict = buildDictionary sentences in
|
||||||
|
saveDictionary "dict.txt" dict
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ description: Please see the README on GitHub at <https://github.com/gith
|
|||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- base >= 4.7 && < 5
|
- base >= 4.7 && < 5
|
||||||
|
- containers
|
||||||
- unescaping-print
|
- unescaping-print
|
||||||
|
|
||||||
ghc-options:
|
ghc-options:
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ library
|
|||||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints
|
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.7 && <5
|
base >=4.7 && <5
|
||||||
|
, containers
|
||||||
, unescaping-print
|
, unescaping-print
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
@@ -49,6 +50,7 @@ executable part2-exe
|
|||||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.7 && <5
|
base >=4.7 && <5
|
||||||
|
, containers
|
||||||
, part2
|
, part2
|
||||||
, unescaping-print
|
, unescaping-print
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
@@ -65,6 +67,7 @@ test-suite part2-test
|
|||||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.7 && <5
|
base >=4.7 && <5
|
||||||
|
, containers
|
||||||
, part2
|
, part2
|
||||||
, unescaping-print
|
, unescaping-print
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
module Lib where
|
module Lib where
|
||||||
|
|
||||||
import Data.Char (isLetter, toLower)
|
import Data.Char (isLetter, toLower)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map
|
||||||
|
import Data.List (nub, tails)
|
||||||
|
import System.IO
|
||||||
|
import UnescapingPrint (ushow)
|
||||||
|
|
||||||
|
|
||||||
splitText :: String -> [[String]]
|
splitText :: String -> [[String]]
|
||||||
@@ -22,4 +27,18 @@ splitText text = filter (not . null) $ map (processSentence . words) (splitSente
|
|||||||
processSentence = filter (not . null) . map cleanWord
|
processSentence = filter (not . null) . map cleanWord
|
||||||
|
|
||||||
cleanWord :: String -> String
|
cleanWord :: String -> String
|
||||||
cleanWord = map toLower . filter isLetter
|
cleanWord = map toLower . filter isLetter
|
||||||
|
|
||||||
|
buildDictionary :: [[String]] -> Map String [String]
|
||||||
|
buildDictionary sentences =
|
||||||
|
let bigrams = [ (w1, w2) | s <- sentences, (w1:w2:_) <- tails s ]
|
||||||
|
trigrams = [ (w1, w2, w3) | s <- sentences, (w1:w2:w3:_) <- tails s ]
|
||||||
|
singleKeys = foldr (\(w1, w2) acc -> Map.insertWith (++) w1 [w2] acc) Map.empty bigrams
|
||||||
|
singleKeys' = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) w1 [w2 ++ " " ++ w3] acc) singleKeys trigrams
|
||||||
|
doubleKeys = foldr (\(w1, w2, w3) acc -> Map.insertWith (++) (w1 ++ " " ++ w2) [w3] acc) Map.empty trigrams
|
||||||
|
combined = Map.unionWith (++) singleKeys' doubleKeys
|
||||||
|
in Map.map nub combined
|
||||||
|
|
||||||
|
saveDictionary :: FilePath -> Map String [String] -> IO ()
|
||||||
|
saveDictionary filePath dict = withFile filePath WriteMode $ \h ->
|
||||||
|
mapM_ (\(k,v) -> hPutStrLn h $ ushow k ++ ": " ++ ushow v) (Map.toList dict)
|
||||||
Reference in New Issue
Block a user